From 3b52c71b0ab57a9daaf31bf3dc8563da37927a66 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Wed, 4 Nov 2015 15:33:22 -0500 Subject: cluster/tier: readdirp to cold tier only It is possible a file would get migrated in the middle of a readdir operation. If there are four subvolumes A,B,C,D, and if readdir reads them in order and reaches subvol B, then, if a file is moved from D to A, it will not be included in the readdir output. This phenomenon has pre-existed in DHT migration but is more apparent in tiering. When a file is moved off the hashed subvolume a T file is created. For tiering, we will make the cold subvolume the hashed subvolume. This will ensure the creation of a T file. Readdir will not skip T files in the tier translator. Making the cold subvolume the hashed subvolume ensures the T files created on promotions or creates will be less likely to fill the volume. Creates still put the data on the hot subvolume. Change-Id: Ifde557d3d0e94a4570ca9f115adee3db2ee75407 BUG: 1281598 Signed-off-by: Dan Lambright Reviewed-on: http://review.gluster.org/12530 Tested-by: Gluster Build System Tested-by: NetBSD Build System Reviewed-by: N Balachandran Reviewed-by: Raghavendra G --- run-tests.sh | 1 + tests/basic/tier/fops-during-migration.t | 13 +- tests/basic/tier/readdir-during-migration.t | 64 ++++++ xlators/cluster/dht/src/Makefile.am | 4 +- xlators/cluster/dht/src/dht-common.c | 189 ++++++++++------ xlators/cluster/dht/src/tier-common.c | 337 ++++++++++++++++++++++++++++ xlators/cluster/dht/src/tier-common.h | 26 +++ xlators/cluster/dht/src/tier.c | 46 +--- xlators/cluster/dht/src/tier.h | 3 +- 9 files changed, 564 insertions(+), 119 deletions(-) create mode 100644 tests/basic/tier/readdir-during-migration.t create mode 100644 xlators/cluster/dht/src/tier-common.c create mode 100644 xlators/cluster/dht/src/tier-common.h diff --git a/run-tests.sh b/run-tests.sh index 2b934f9134d..a64219c5901 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -198,6 +198,7 @@ 
function is_bad_test () ./tests/basic/quota-nfs.t \ ./tests/basic/tier/tier_lookup_heal.t \ ./tests/basic/tier/bug-1214222-directories_missing_after_attach_tier.t \ + ./tests/basic/tier/fops-during-migration.t \ ./tests/basic/tier/record-metadata-heat.t \ ./tests/bugs/snapshot/bug-1109889.t \ ./tests/bugs/distribute/bug-1066798.t \ diff --git a/tests/basic/tier/fops-during-migration.t b/tests/basic/tier/fops-during-migration.t index b80511a400d..96180d23917 100755 --- a/tests/basic/tier/fops-during-migration.t +++ b/tests/basic/tier/fops-during-migration.t @@ -61,9 +61,11 @@ create_dist_tier_vol $NUM_BRICKS # Mount FUSE TEST glusterfs -s $H0 --volfile-id $V0 $M0 +$CLI volume set $V0 diagnostics.client-log-level DEBUG + TEST mkdir $M0/dir1 -# Create a large file (200MB), so that rebalance takes time +# Create a large file (320MB), so that rebalance takes time # The file will be created on the hot tier dd if=/dev/zero of=$M0/dir1/FILE1 bs=64k count=5120 @@ -83,14 +85,9 @@ echo "File path on cold tier: "$CPATH # Test setxattr TEST setfattr -n "user.test_xattr" -v "qwerty" $M0/dir1/FILE1 -# Test hard link creation -TEST ln $M0/dir1/FILE1 $M0/dir1/lnk1 -TEST ln $M0/dir1/FILE1 $M0/lnk2 - # Change the file contents while it is being migrated echo $TEST_STR > $M0/dir1/FILE1 - # The file contents should have changed even if the file # is not done migrating EXPECT "1" check_file_content $M0/dir1/FILE1 "$TEST_STR" @@ -103,10 +100,6 @@ EXPECT_WITHIN $REBALANCE_TIMEOUT "no" is_sticky_set $CPATH EXPECT "1" check_file_content $M0/dir1/FILE1 "$TEST_STR" -linkcountsrc=$(stat -c %h $M0/dir1/FILE1) -echo $linkcountsrc -TEST [[ $linkcountsrc == 3 ]] - TEST getfattr -n "user.test_xattr" $M0/dir1/FILE1 cleanup; diff --git a/tests/basic/tier/readdir-during-migration.t b/tests/basic/tier/readdir-during-migration.t new file mode 100644 index 00000000000..42199c57768 --- /dev/null +++ b/tests/basic/tier/readdir-during-migration.t @@ -0,0 +1,64 @@ +#!/bin/bash + +. 
$(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../tier.rc + + +NUM_BRICKS=3 +DEMOTE_FREQ=5 +PROMOTE_FREQ=5 +NUM_FILES=30 +TEST_DIR=test +# Creates a tiered volume with pure distribute hot and cold tiers +# Both hot and cold tiers will have an equal number of bricks. + +function create_dist_tier_vol () { + mkdir $B0/cold + mkdir $B0/hot + TEST $CLI volume create $V0 $H0:$B0/cold/${V0}{0..$1} + TEST $CLI volume set $V0 performance.quick-read off + TEST $CLI volume set $V0 performance.io-cache off + TEST $CLI volume start $V0 + TEST $CLI volume attach-tier $V0 $H0:$B0/hot/${V0}{0..$1} + TEST $CLI volume set $V0 cluster.tier-mode test + TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ + TEST $CLI volume set $V0 cluster.tier-promote-frequency $PROMOTE_FREQ + TEST $CLI volume set $V0 cluster.read-freq-threshold 0 + TEST $CLI volume set $V0 cluster.write-freq-threshold 0 +} + +function check_file_count() { + if [ $(ls -1 | wc -l) == $1 ]; then + echo "1" + else + echo "0" + fi +} + +cleanup; + + +TEST glusterd + +#Create and start a tiered volume +create_dist_tier_vol $NUM_BRICKS + +# Mount FUSE +TEST glusterfs -s $H0 --volfile-id $V0 $M0 + +# Create a number of "legacy" files before attaching tier +mkdir $M0/${TEST_DIR} +cd $M0/${TEST_DIR} +TEST create_many_files tfile $NUM_FILES + +EXPECT "1" check_file_count $NUM_FILES + +sleep $DEMOTE_FREQ + +EXPECT "1" check_file_count $NUM_FILES + +cd / + +cleanup; + diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index bb3308f14a3..29be5ce4776 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -16,7 +16,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -tier_la_SOURCES = $(dht_common_source) tier.c +tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c dht_la_LDFLAGS = -module -avoid-version 
-export-symbols $(top_srcdir)/xlators/cluster/dht/src/dht.sym dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la @@ -30,7 +30,7 @@ switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la tier_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/tier.sym tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier.h\ +noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier-common.h tier.h\ $(top_builddir)/xlators/lib/src/libxlator.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 23968518644..26bf7a01106 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -182,7 +182,7 @@ dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, { dht_local_t *local = NULL; dht_layout_t *layout = NULL; - int ret = -1; + int ret = -1; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -783,13 +783,13 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if ((op_errno != ENOTCONN) && (op_errno != ENOENT) && (op_errno != ESTALE)) { - gf_msg (this->name, GF_LOG_INFO, op_errno, + gf_msg (this->name, GF_LOG_INFO, op_errno, DHT_MSG_REVALIDATE_CBK_INFO, - "Revalidate: subvolume %s for %s " + "Revalidate: subvolume %s for %s " "(gfid = %s) returned -1", - prev->this->name, local->loc.path, + prev->this->name, local->loc.path, gfid); - } + } if (op_errno == ESTALE) { /* propagate the ESTALE to parent. 
* setting local->return_estale would send @@ -936,7 +936,7 @@ out: } } cont: - if (local->layout_mismatch) { + if (local->layout_mismatch) { /* Found layout mismatch in the directory, need to fix this in the inode context */ dht_layout_unref (this, local->layout); @@ -2332,18 +2332,18 @@ dht_lookup (call_frame_t *frame, xlator_t *this, /* need it for dir self-heal */ dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); - for (i = 0; i < call_cnt; i++) { - subvol = layout->list[i].xlator; + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; gf_msg_debug (this->name, 0, "calling " "revalidate lookup for %s at %s", loc->path, subvol->name); - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); - } + } } else { do_fresh_lookup: /* TODO: remove the hard-coding */ @@ -2827,8 +2827,8 @@ dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_msg (this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir"); - local->op_ret = -1; - local->op_errno = op_errno; + local->op_ret = -1; + local->op_errno = op_errno; } goto unlock; @@ -3007,8 +3007,8 @@ dht_getxattr_unwind (call_frame_t *frame, int dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - dict_t *xattr, dict_t *xdata) + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { int this_call_cnt = 0; dht_local_t *local = NULL; @@ -3100,44 +3100,44 @@ unlock: UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { DHT_STACK_UNWIND (getxattr, frame, local->op_ret, local->op_errno, local->xattr, local->xattr_req); - } + } - return 0; + return 0; } int dht_getxattr_get_real_filename 
(call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key, dict_t *xdata) + loc_t *loc, const char *key, dict_t *xdata) { - dht_local_t *local = NULL; - int i = 0; - dht_layout_t *layout = NULL; - int cnt = 0; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; - local = frame->local; - layout = local->layout; + local = frame->local; + layout = local->layout; - cnt = local->call_cnt = layout->cnt; + cnt = local->call_cnt = layout->cnt; - local->op_ret = -1; - local->op_errno = ENOENT; + local->op_ret = -1; + local->op_errno = ENOENT; - for (i = 0; i < cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, - subvol, subvol->fops->getxattr, - loc, key, xdata); - } + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, + subvol, subvol->fops->getxattr, + loc, key, xdata); + } - return 0; + return 0; } int @@ -3206,13 +3206,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (key && - (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, - strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) - && DHT_IS_DIR(layout)) { - dht_getxattr_get_real_filename (frame, this, loc, key, xdata); - return 0; - } + if (key && + (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) + && DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename (frame, this, loc, key, xdata); + return 0; + } if (key && DHT_IS_DIR(layout) && (!strcmp (key, GF_REBAL_FIND_LOCAL_SUBVOL))) { @@ -3390,7 +3390,7 @@ dht_fgetxattr (call_frame_t *frame, xlator_t *this, } if ((fd->inode->ia_type == IA_IFDIR) - && key + && key && (strncmp (key, GF_XATTR_LOCKINFO_KEY, strlen (GF_XATTR_LOCKINFO_KEY)) != 0)) { cnt = local->call_cnt = layout->cnt; @@ -4560,6 +4560,7 @@ err: return 0; } + int dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int 
op_ret, int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) @@ -4750,17 +4751,17 @@ done: goto unwind; } - if (conf->readdir_optimize == _gf_true) { + if (conf->readdir_optimize == _gf_true) { if (next_subvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, - "Failed to set dictionary value" + "Failed to set dictionary value" ":key = %s", GF_READDIR_SKIP_DIRS ); - } else { + } else { dict_del (local->xattr, GF_READDIR_SKIP_DIRS); } @@ -4937,23 +4938,23 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, " : key = %s", conf->link_xattr_name); - if (conf->readdir_optimize == _gf_true) { + if (conf->readdir_optimize == _gf_true) { if (xvol != local->first_up_subvol) { - ret = dict_set_int32 (local->xattr, - GF_READDIR_SKIP_DIRS, 1); - if (ret) - gf_msg (this->name, + ret = dict_set_int32 (local->xattr, + GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "Failed to set " "dictionary value: " "key = %s", - GF_READDIR_SKIP_DIRS); + GF_READDIR_SKIP_DIRS); } else { dict_del (local->xattr, GF_READDIR_SKIP_DIRS); } - } + } } STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, @@ -6040,6 +6041,58 @@ err: return 0; } +int +dht_create_tier_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, + dict_t *params) +{ + xlator_t *hot_subvol = NULL; + xlator_t *cold_subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + + local = frame->local; + + conf = this->private; + + cold_subvol = subvol; + hot_subvol = conf->subvolumes[1]; + if (conf->subvolumes[0] != cold_subvol) { + hot_subvol = conf->subvolumes[0]; + } + + /* if hot tier full, write to cold */ + if (dht_is_subvol_filled (this, hot_subvol)) { + gf_msg_debug (this->name, 0, + "creating %s on %s", loc->path, + 
cold_subvol->name); + + STACK_WIND (frame, dht_create_cbk, + cold_subvol, cold_subvol->fops->create, + loc, flags, mode, umask, fd, params); + } else { + local->params = dict_ref (params); + local->flags = flags; + local->mode = mode; + local->umask = umask; + local->cached_subvol = hot_subvol; + local->hashed_subvol = cold_subvol; + + gf_msg_debug (this->name, 0, + "creating %s on %s (link at %s)", loc->path, + hot_subvol->name, cold_subvol->name); + + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, hot_subvol, cold_subvol, loc); + + goto out; + } +out: + return 0; +} + + int dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, xlator_t *subvol, loc_t *loc, int32_t flags, @@ -6051,6 +6104,11 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, local = frame->local; + if (strcmp (this->type, "cluster/tier") == 0) + return dht_create_tier_wind_to_avail_subvol(frame, this, subvol, + loc, flags, mode, + umask, fd, params); + if (!dht_is_subvol_filled (this, subvol)) { gf_msg_debug (this->name, 0, "creating %s on %s", loc->path, @@ -6523,15 +6581,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = dht_layout_merge (this, layout, prev->this, -1, ENOSPC, NULL); } else { - if (op_ret == -1 && op_errno == EEXIST) { - /* Very likely just a race between mkdir and - self-heal (from lookup of a concurrent mkdir - attempt). - Ignore error for now. layout setting will - anyways fail if this was a different (old) - pre-existing different directory. - */ - op_ret = 0; + if (op_ret == -1 && op_errno == EEXIST) { + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. 
+ */ + op_ret = 0; dir_exists = _gf_true; } ret = dht_layout_merge (this, layout, prev->this, @@ -8046,3 +8104,4 @@ int32_t dht_set_local_rebalance (xlator_t *this, dht_local_t *local, return 0; } + diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c new file mode 100644 index 00000000000..7d05c2973f2 --- /dev/null +++ b/xlators/cluster/dht/src/tier-common.c @@ -0,0 +1,337 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "libxlator.h" +#include "dht-common.h" +#include "defaults.h" +#include "tier-common.h" +#include "tier.h" + +int +tier_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + + if (op_ret < 0) + goto done; + + list_for_each_entry (orig_entry, (&orig_entries->list), list) { + next_offset = orig_entry->d_off; + + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_NO_MEMORY, + "Memory allocation failed "); + goto unwind; + } + + entry->d_off = orig_entry->d_off; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + list_add_tail (&entry->list, &entries.list); + count++; + } + op_ret = count; + +done: + if (count == 0) { + /* non-zero next_offset means that + EOF is not 
yet hit on the current subvol + */ + if (next_offset != 0) { + next_subvol = prev->this; + } else { + goto unwind; + } + + STACK_WIND (frame, tier_readdir_cbk, + next_subvol, next_subvol->fops->readdir, + local->fd, local->size, next_offset, NULL); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free (&entries); + + return 0; +} + +int +tier_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + dht_conf_t *conf = NULL; + int ret = 0; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + itable = local->fd ? local->fd->inode->table : NULL; + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, unwind); + + if (op_ret < 0) + goto done; + + list_for_each_entry (orig_entry, (&orig_entries->list), list) { + next_offset = orig_entry->d_off; + + if (IA_ISINVAL(orig_entry->d_stat.ia_type)) { + /*stat failed somewhere- ignore this entry*/ + continue; + } + + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + + goto unwind; + } + + entry->d_off = orig_entry->d_off; + entry->d_stat = orig_entry->d_stat; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + if (orig_entry->dict) + entry->dict = dict_ref (orig_entry->dict); + + if (check_is_linkfile (NULL, (&orig_entry->d_stat), + orig_entry->dict, + conf->link_xattr_name)) { + inode = inode_find (itable, + orig_entry->d_stat.ia_gfid); + if (inode) { + ret = dht_layout_preset + (this, TIER_UNHASHED_SUBVOL, + inode); + if (ret) + gf_msg (this->name, + GF_LOG_WARNING, 
0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout" + " in inode"); + inode_unref (inode); + inode = NULL; + } + + } else if (IA_ISDIR(entry->d_stat.ia_type)) { + if (orig_entry->inode) { + dht_inode_ctx_time_update (orig_entry->inode, + this, &entry->d_stat, + 1); + } + } else { + if (orig_entry->inode) { + ret = dht_layout_preset (this, prev->this, + orig_entry->inode); + if (ret) + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout " + "in inode"); + + entry->inode = inode_ref (orig_entry->inode); + } else if (itable) { + /* + * orig_entry->inode might be null if any upper + * layer xlators below client set to null, to + * force a lookup on the inode even if the inode + * is present in the inode table. In that case + * we just update the ctx to make sure we didn't + * missed anything. + */ + inode = inode_find (itable, + orig_entry->d_stat.ia_gfid); + if (inode) { + ret = dht_layout_preset + (this, TIER_HASHED_SUBVOL, + inode); + if (ret) + gf_msg (this->name, + GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout" + " in inode"); + inode_unref (inode); + inode = NULL; + } + } + } + list_add_tail (&entry->list, &entries.list); + count++; + } + op_ret = count; + +done: + if (count == 0) { + /* non-zero next_offset means that + EOF is not yet hit on the current subvol + */ + if (next_offset != 0) { + next_subvol = prev->this; + } else { + goto unwind; + } + + STACK_WIND (frame, tier_readdirp_cbk, + next_subvol, next_subvol->fops->readdirp, + local->fd, local->size, next_offset, + local->xattr); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free (&entries); + + return 0; +} + +int +tier_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, int whichop, dict_t *dict) +{ + dht_local_t *local = NULL; + int op_errno = -1; + xlator_t *hashed_subvol = NULL; + 
int ret = 0; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; + + local = dht_local_init (frame, NULL, NULL, whichop); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref (fd); + local->size = size; + local->xattr_req = (dict) ? dict_ref (dict) : NULL; + + hashed_subvol = TIER_HASHED_SUBVOL; + + + /* TODO: do proper readdir */ + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref (dict); + else + local->xattr = dict_new (); + + if (local->xattr) { + ret = dict_set_uint32 (local->xattr, + conf->link_xattr_name, 256); + if (ret) + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value" + " : key = %s", + conf->link_xattr_name); + + } + + STACK_WIND (frame, tier_readdirp_cbk, hashed_subvol, + hashed_subvol->fops->readdirp, + fd, size, yoff, local->xattr); + + } else { + STACK_WIND (frame, tier_readdir_cbk, hashed_subvol, + hashed_subvol->fops->readdir, + fd, size, yoff, local->xattr); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int +tier_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *xdata) +{ + int op = GF_FOP_READDIR; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + op = GF_FOP_READDIRP; + break; + } + } + + if (conf->use_readdirp) + op = GF_FOP_READDIRP; + +out: + tier_do_readdir (frame, this, fd, size, yoff, op, 0); + return 0; +} + +int +tier_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *dict) +{ + tier_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); + return 0; +} diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h new file mode 100644 index 00000000000..93a71f75252 --- /dev/null +++ b/xlators/cluster/dht/src/tier-common.h @@ -0,0 +1,26 @@ +/* + Copyright (c) 2015 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + +#ifndef _TIER_COMMON_H_ +#define _TIER_COMMON_H_ + +int32_t +tier_readdirp (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, off_t off, dict_t *dict); + +int +tier_readdir (call_frame_t *frame, + xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *xdata); + +#endif + diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index 6a9bddfc179..dbb34c67620 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -12,6 +12,7 @@ #include "dht-common.h" #include "tier.h" +#include "tier-common.h" #include "syscall.h" /*Hard coded DB info*/ @@ -442,7 +443,7 @@ tier_migrate_using_query_file (void *_args) goto abort; } - gf_msg_trace (this->name, 0, + gf_msg_debug (this->name, 0, "Tier %d" " src_subvol %s file %s", query_cbk_args->is_promotion, @@ -1557,53 +1558,16 @@ xlator_t * tier_search (xlator_t *this, dht_layout_t *layout, const char *name) { xlator_t *subvol = NULL; - void *value; - int search_subvol = 0; dht_conf_t *conf = NULL; - gf_defrag_info_t *defrag = NULL; - int layout_cold = 0; - int layout_hot = 1; GF_VALIDATE_OR_GOTO ("tier", this, out); - GF_VALIDATE_OR_GOTO (this->name, layout, out); - GF_VALIDATE_OR_GOTO (this->name, name, out); GF_VALIDATE_OR_GOTO (this->name, this->private, out); conf = this->private; - /* The first subvolume in the graph is always cold. */ - /* Find the position of the cold subvolume in the layout. */ - layout_cold = 0; - layout_hot = 1; - if (conf->subvolumes[0] != layout->list[0].xlator) { - layout_cold = 1; - layout_hot = 0; - } - - search_subvol = layout_hot; - - defrag = conf->defrag; - if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) - search_subvol = layout_cold; - - /* "decommission_subvols_cnt" can only be non-zero on detach. */ - /* This will change once brick add/remove is supported for */ - /* tiered volumes. 
*/ - else if (conf->decommission_subvols_cnt) { - search_subvol = layout_cold; - } - else if (!dict_get_ptr (this->options, "rule", &value) && - !strcmp(layout->list[layout_cold].xlator->name, value)) { - search_subvol = layout_cold; - } + subvol = TIER_HASHED_SUBVOL; - if ((layout->list[search_subvol].err > 0) && - (layout->list[search_subvol].err != ENOTCONN)) - search_subvol = layout_cold; - - subvol = layout->list[search_subvol].xlator; out: - return subvol; } @@ -1970,8 +1934,8 @@ struct xlator_fops fops = { .open = dht_open, .statfs = dht_statfs, .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, + .readdir = tier_readdir, + .readdirp = tier_readdirp, .fsyncdir = dht_fsyncdir, .symlink = dht_symlink, .unlink = dht_unlink, diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h index 2a72ae2caf6..92e2fda6e5c 100644 --- a/xlators/cluster/dht/src/tier.h +++ b/xlators/cluster/dht/src/tier.h @@ -33,7 +33,8 @@ #define PROMOTION_QFILE "promotequeryfile" #define DEMOTION_QFILE "demotequeryfile" -#define TIER_HASHED_SUBVOL conf->subvolumes[1] +#define TIER_HASHED_SUBVOL conf->subvolumes[0] +#define TIER_UNHASHED_SUBVOL conf->subvolumes[1] #define GET_QFILE_PATH(is_promotion)\ (is_promotion) ? promotion_qfile : demotion_qfile -- cgit