summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libglusterfs/src/dict.c26
-rw-r--r--libglusterfs/src/dict.h1
-rw-r--r--libglusterfs/src/gfdb/gfdb_data_store.h1
-rw-r--r--libglusterfs/src/gfdb/gfdb_sqlite3.h1
-rwxr-xr-xtests/basic/tier/tier.t116
-rw-r--r--xlators/cluster/dht/src/Makefile.am11
-rw-r--r--xlators/cluster/dht/src/dht-common.c100
-rw-r--r--xlators/cluster/dht/src/dht-common.h29
-rw-r--r--xlators/cluster/dht/src/dht-helper.c22
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c1
-rw-r--r--xlators/cluster/dht/src/dht-messages.h19
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c28
-rw-r--r--xlators/cluster/dht/src/dht-shared.c31
-rw-r--r--xlators/cluster/dht/src/dht.c7
-rw-r--r--xlators/cluster/dht/src/nufa.c5
-rw-r--r--xlators/cluster/dht/src/tier.c1007
-rw-r--r--xlators/cluster/dht/src/tier.h71
17 files changed, 1443 insertions, 33 deletions
diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c
index 31867588def..81db64dfd40 100644
--- a/libglusterfs/src/dict.c
+++ b/libglusterfs/src/dict.c
@@ -2156,6 +2156,32 @@ err:
return ret;
}
+int
+dict_add_dynstr_with_alloc (dict_t *this, char *key, char *str) /* add (not set) a dup'd copy of 'str' under 'key'; dict_add allows duplicate keys. Returns 0 on success, negative errno-style value on failure */
+{
+ data_t *data = NULL;
+ int ret = -ENOMEM; /* fail by default: covers gf_strdup failure below (was 0, which reported success on OOM) */
+ char *alloc_str = NULL;
+
+ alloc_str = gf_strdup (str);
+ if (!alloc_str)
+ goto out;
+
+ data = data_from_dynstr (alloc_str);
+ if (!data) {
+ GF_FREE (alloc_str);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = dict_add (this, key, data);
+ if (ret < 0)
+ data_destroy (data); /* destroys data and the dynstr it now owns */
+
+out:
+ return ret;
+}
+
/*
for malloced strings we should do a free instead of GF_FREE
*/
diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h
index a4c1815d06a..a1a4c85f711 100644
--- a/libglusterfs/src/dict.h
+++ b/libglusterfs/src/dict.h
@@ -244,6 +244,7 @@ GF_MUST_CHECK int dict_set_str (dict_t *this, char *key, char *str);
GF_MUST_CHECK int dict_set_dynmstr (dict_t *this, char *key, char *str);
GF_MUST_CHECK int dict_set_dynstr (dict_t *this, char *key, char *str);
GF_MUST_CHECK int dict_set_dynstr_with_alloc (dict_t *this, char *key, const char *str);
+GF_MUST_CHECK int dict_add_dynstr_with_alloc (dict_t *this, char *key, char *str);
GF_MUST_CHECK int dict_get_str (dict_t *this, char *key, char **str);
GF_MUST_CHECK int dict_get_str_boolean (dict_t *this, char *key, int default_val);
diff --git a/libglusterfs/src/gfdb/gfdb_data_store.h b/libglusterfs/src/gfdb/gfdb_data_store.h
index 1212c2b3fe1..ffe590b84bd 100644
--- a/libglusterfs/src/gfdb/gfdb_data_store.h
+++ b/libglusterfs/src/gfdb/gfdb_data_store.h
@@ -26,7 +26,6 @@
#include "gfdb_data_store_types.h"
#include "gfdb_sqlite3.h"
-
/* GFDB Connection Node:
* ~~~~~~~~~~~~~~~~~~~~
* Represents the connection to the database while using libgfdb
diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.h b/libglusterfs/src/gfdb/gfdb_sqlite3.h
index b0c4b2b5fdd..04bfde7fa38 100644
--- a/libglusterfs/src/gfdb/gfdb_sqlite3.h
+++ b/libglusterfs/src/gfdb/gfdb_sqlite3.h
@@ -143,7 +143,6 @@ do {\
};\
} while (0)
-
#define GF_SQLITE3_SET_PRAGMA(sqlite3_config_str, param_key, format, value,\
ret, error)\
do {\
diff --git a/tests/basic/tier/tier.t b/tests/basic/tier/tier.t
new file mode 100755
index 00000000000..6bd6fdf8849
--- /dev/null
+++ b/tests/basic/tier/tier.t
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+function file_on_slow_tier { # echo "0" iff $1 exists on one of the slow-tier bricks (0..$LAST_BRICK), else "1"
+ s=$(md5sum $1)
+ for i in `seq 0 $LAST_BRICK`; do
+ test -e $B0/${V0}${i}/$1 && break; # loop exit status is this last test: 0 when the file was found
+ done
+ if [ $? -eq 0 ] && ! [ "`md5sum $B0/${V0}${i}/$1`" == "$s" ]; then # NOTE(review): md5sum output embeds the file path, so the two strings always differ and this second test is always true — confirm a checksum compare was intended
+ echo "0"
+ else
+ echo "1"
+ fi
+}
+
+function file_on_fast_tier { # echo "0" iff $1 exists on the hot-tier (cache) brick, else "1"
+ local ret="1" # NOTE(review): unused — the result is echoed below, never read from $ret
+
+ s1=$(md5sum $1)
+ s2=$(md5sum $B0/${V0}${CACHE_BRICK}/$1)
+ if [ -e $B0/${V0}${CACHE_BRICK}/$1 ] && ! [ "$s1" == "$s2" ]; then # NOTE(review): s1/s2 embed different paths so they always differ; the existence test is what decides — confirm intended
+ echo "0"
+ else
+ echo "1"
+ fi
+}
+
+function confirm_tier_removed { # echo "0" once brick $1 no longer appears in the generated volfile, "1" while it is still present
+ $CLI system getspec $V0 | grep $1
+ if [ $? == 0 ] ; then
+ echo "1"
+ else
+ echo "0"
+ fi
+}
+
+LAST_BRICK=1 # cold (slow) tier: bricks 0..1
+CACHE_BRICK=2 # hot (fast) tier brick index
+DEMOTE_TIMEOUT=12 # seconds to wait for auto-demotion (demote-frequency is set to 4 below)
+PROMOTE_TIMEOUT=5 # seconds to wait for auto-promotion
+cleanup
+
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0..$LAST_BRICK} # NOTE(review): bash performs brace expansion BEFORE parameter expansion, so {0..$LAST_BRICK} is left literal — confirm this actually creates two bricks
+TEST $CLI volume attach-tier $V0 $H0:$B0/${V0}${CACHE_BRICK}
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 features.ctr-enabled on # change-time recorder feeds the tier DB
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+
+# Basic operations.
+cd $M0
+TEST stat .
+TEST mkdir d1
+TEST [ -d d1 ]
+TEST touch d1/file1
+TEST mkdir d1/d2
+TEST [ -d d1/d2 ]
+TEST find d1
+
+# Create a file. It should be on the fast tier.
+uuidgen > d1/data.txt
+TEST file_on_fast_tier d1/data.txt
+
+# Check manual demotion.
+#TEST setfattr -n trusted.distribute.migrate-data d1/data.txt
+#TEST file_on_slow_tier d1/data.txt
+
+TEST $CLI volume set $V0 cluster.tier-demote-frequency 4
+TEST $CLI volume set $V0 cluster.tier-promote-frequency 4
+TEST $CLI volume set $V0 performance.quick-read off # disable client caches so reads reach the bricks and get recorded
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume rebalance $V0 tier start
+uuidgen > d1/data2.txt
+uuidgen > d1/data3.txt
+EXPECT "0" file_on_fast_tier d1/data2.txt
+EXPECT "0" file_on_fast_tier d1/data3.txt
+
+# Check auto-demotion on write new.
+EXPECT_WITHIN $DEMOTE_TIMEOUT "0" file_on_slow_tier d1/data2.txt
+EXPECT_WITHIN $DEMOTE_TIMEOUT "0" file_on_slow_tier d1/data3.txt
+sleep 12 # NOTE(review): magic number — presumably one demote cycle; consider reusing $DEMOTE_TIMEOUT
+# Check auto-promotion on write append.
+uuidgen >> d1/data2.txt
+
+# Check promotion on read to slow tier
+echo 3 > /proc/sys/vm/drop_caches # drop page cache so the read below hits the brick and is recorded
+cat d1/data3.txt
+sleep 5 # NOTE(review): magic number — presumably one promote cycle; consider reusing $PROMOTE_TIMEOUT
+EXPECT_WITHIN $PROMOTE_TIMEOUT "0" file_on_fast_tier d1/data2.txt
+EXPECT_WITHIN $PROMOTE_TIMEOUT "0" file_on_fast_tier d1/data3.txt
+
+# Test rebalance commands
+TEST $CLI volume rebalance $V0 tier status
+TEST $CLI volume rebalance $V0 stop
+
+# stop gluster, when it comes back info file should have tiered volume
+killall glusterd # NOTE(review): unchecked — fails silently if glusterd already exited
+TEST glusterd
+
+# TBD: Remove force. Gracefully migrate data off hot tier.
+# Rebalance+promotion/demotion is under construction.
+
+TEST $CLI volume detach-tier $V0
+
+# temporarily comment out
+#TEST ! [ -e $M0/d1/data.txt ]
+
+EXPECT "0" confirm_tier_removed ${V0}${CACHE_BRICK}
+
+TEST $CLI volume stop $V0
+
+cleanup
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 8d02749f4d9..46dc4bb840f 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,4 +1,4 @@
-xlator_LTLIBRARIES = dht.la nufa.la switch.la
+xlator_LTLIBRARIES = dht.la nufa.la switch.la tier.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
@@ -10,6 +10,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
+tier_la_SOURCES = $(dht_common_source) tier.c
dht_la_LDFLAGS = -module -avoid-version
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
@@ -20,10 +21,16 @@ nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
switch_la_LDFLAGS = -module -avoid-version
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h \
+tier_la_LDFLAGS = -module -avoid-version
+tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+ $(top_builddir)/libglusterfs/src/gfdb/libgfdb.la
+AM_CFLAGS = -Wall $(GF_CFLAGS) $(SQLITE_CFLAGS)
+
+noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier.h\
$(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/libglusterfs/src/gfdb \
-I$(top_srcdir)/xlators/lib/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 9fda4aa07d6..3a196e07be9 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -3175,6 +3175,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
dht_local_t *local = NULL;
dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
dht_layout_t *layout = NULL;
int i = 0;
int op_errno = EINVAL;
@@ -3191,6 +3192,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (loc->inode, err);
conf = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, conf, err);
+
+ methods = conf->methods;
+ GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
@@ -3255,8 +3260,8 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
goto err;
}
- local->rebalance.target_node =
- dht_subvol_get_hashed (this, &local->loc);
+ methods->migration_get_dst_subvol(this, local);
+
if (!local->rebalance.target_node) {
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_HASHED_SUBVOL_GET_FAILED,
@@ -3719,7 +3724,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -3730,7 +3734,7 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
goto err;
}
- if (IA_ISDIR (loc->inode->ia_type)) {
+ if (!loc->inode || IA_ISDIR (loc->inode->ia_type)) {
local->call_cnt = conf->subvolume_cnt;
for (i = 0; i < conf->subvolume_cnt; i++) {
@@ -3820,6 +3824,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
int count = 0;
dht_layout_t *layout = 0;
dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
xlator_t *subvol = 0;
xlator_t *hashed_subvol = 0;
int ret = 0;
@@ -3828,7 +3833,12 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
INIT_LIST_HEAD (&entries.list);
prev = cookie;
local = frame->local;
+
conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+ methods = conf->methods;
+ GF_VALIDATE_OR_GOTO(this->name, conf->methods, done);
if (op_ret < 0)
goto done;
@@ -3867,8 +3877,8 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
}
- hashed_subvol = dht_layout_search (this, layout, \
- orig_entry->d_name);
+ hashed_subvol = methods->layout_search (this, layout,
+ orig_entry->d_name);
if (prev->this == hashed_subvol)
goto list;
@@ -3894,8 +3904,8 @@ list:
/* Do this if conf->search_unhashed is set to "auto" */
if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
- subvol = dht_layout_search (this, layout,
- orig_entry->d_name);
+ subvol = methods->layout_search (this, layout,
+ orig_entry->d_name);
if (!subvol || (subvol != prev->this)) {
/* TODO: Count the number of entries which need
linkfile to prove its existence in fs */
@@ -4008,11 +4018,19 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int count = 0;
dht_layout_t *layout = 0;
xlator_t *subvol = 0;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
INIT_LIST_HEAD (&entries.list);
prev = cookie;
local = frame->local;
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, conf, done);
+
+ methods = conf->methods;
+ GF_VALIDATE_OR_GOTO (this->name, conf->methods, done);
+
if (op_ret < 0)
goto done;
@@ -4024,7 +4042,8 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
list_for_each_entry (orig_entry, (&orig_entries->list), list) {
next_offset = orig_entry->d_off;
- subvol = dht_layout_search (this, layout, orig_entry->d_name);
+ subvol = methods->layout_search (this, layout,
+ orig_entry->d_name);
if (!subvol || (subvol == prev->this)) {
entry = gf_dirent_for_name (orig_entry->d_name);
@@ -6073,11 +6092,13 @@ dht_notify (xlator_t *this, int event, void *data, ...)
gf_defrag_type cmd = 0;
dict_t *output = NULL;
va_list ap;
-
+ dht_methods_t *methods = NULL;
conf = this->private;
- if (!conf)
- return ret;
+ GF_VALIDATE_OR_GOTO (this->name, conf, out);
+
+ methods = conf->methods;
+ GF_VALIDATE_OR_GOTO (this->name, methods, out);
/* had all subvolumes reported status once till now? */
had_heard_from_all = 1;
@@ -6271,12 +6292,18 @@ unlock:
* not need to handle CHILD_DOWN event here.
*/
if (conf->defrag) {
- ret = gf_thread_create (&conf->defrag->th, NULL,
- gf_defrag_start, this);
- if (ret) {
- conf->defrag = NULL;
+ if (methods->migration_needed(this)) {
+ ret = gf_thread_create(&conf->defrag->th,
+ NULL,
+ gf_defrag_start, this);
+ if (ret) {
+ GF_FREE (conf->defrag);
+ conf->defrag = NULL;
+ kill (getpid(), SIGTERM);
+ }
+ } else {
GF_FREE (conf->defrag);
- kill (getpid(), SIGTERM);
+ conf->defrag = NULL;
}
}
}
@@ -6284,7 +6311,7 @@ unlock:
ret = 0;
if (propagate)
ret = default_notify (this, event, data);
-
+out:
return ret;
}
@@ -6412,3 +6439,40 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
err:
GF_FREE (output_string);
}
+
+int32_t dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local) /* default (plain DHT) methods->migration_get_dst_subvol: target the hashed subvol for this loc */
+{
+ int ret = -1;
+
+ if (!local)
+ goto out;
+
+ local->rebalance.target_node =
+ dht_subvol_get_hashed (this, &local->loc);
+
+ if (local->rebalance.target_node)
+ ret = 0; /* 0 = target set; callers (e.g. dht_setxattr) also check local->rebalance.target_node directly */
+
+out:
+ return ret;
+}
+
+int32_t dht_migration_needed(xlator_t *this) /* default methods->migration_needed: 1 = spawn the gf_defrag_start crawl thread (see dht_notify), 0 = skip it */
+{
+ gf_defrag_info_t *defrag = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0; /* default 0: no crawl for a tier defrag, or when conf/defrag are missing */
+
+ conf = this->private;
+
+ GF_VALIDATE_OR_GOTO ("dht", conf, out);
+ GF_VALIDATE_OR_GOTO ("dht", conf->defrag, out);
+
+ defrag = conf->defrag;
+
+ if (defrag->cmd != GF_DEFRAG_CMD_START_TIER) /* tiered volumes drive migration from the tier xlator, not this crawl */
+ ret = 1;
+
+out:
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 67e693146af..9145f336d7c 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -267,6 +267,8 @@ enum gf_defrag_type {
GF_DEFRAG_CMD_STATUS = 1 + 2,
GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,
GF_DEFRAG_CMD_START_FORCE = 1 + 4,
+ GF_DEFRAG_CMD_START_TIER = 1 + 5,
+ GF_DEFRAG_CMD_STATUS_TIER = 1 + 6,
};
typedef enum gf_defrag_type gf_defrag_type;
@@ -310,10 +312,31 @@ struct gf_defrag_info_ {
struct timeval start_time;
gf_boolean_t stats;
gf_defrag_pattern_list_t *defrag_pattern;
+ int tier_promote_frequency;
+ int tier_demote_frequency;
+
+ /*Data Tiering params for scanner*/
+ uint64_t total_files_promoted;
+ uint64_t total_files_demoted;
+ int write_freq_threshold;
+ int read_freq_threshold;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
+struct dht_methods_s {
+ int32_t (*migration_get_dst_subvol)(xlator_t *this,
+ dht_local_t *local);
+ int32_t (*migration_other)(xlator_t *this,
+ gf_defrag_info_t *defrag);
+ int32_t (*migration_needed)(xlator_t *this);
+ xlator_t* (*layout_search)(xlator_t *this,
+ dht_layout_t *layout,
+ const char *name);
+};
+
+typedef struct dht_methods_s dht_methods_t;
+
struct dht_conf {
gf_lock_t subvolume_lock;
int subvolume_cnt;
@@ -371,6 +394,8 @@ struct dht_conf {
gf_boolean_t do_weighting;
gf_boolean_t randomize_by_gfid;
+ dht_methods_t *methods;
+
struct mem_pool *lock_pool;
};
typedef struct dht_conf dht_conf_t;
@@ -477,6 +502,10 @@ dht_layout_t *dht_layout_get (xlator_t *this, inode_t
dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
const char *name);
+int32_t
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local);
+int32_t
+dht_migration_needed(xlator_t *this);
int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t *holes_p, uint32_t *overlaps_p,
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index f4e5305d791..346d19bec88 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -502,10 +502,18 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
{
dht_layout_t *layout = NULL;
xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
GF_VALIDATE_OR_GOTO ("dht", this, out);
GF_VALIDATE_OR_GOTO (this->name, loc, out);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, conf, out);
+
+ methods = conf->methods;
+ GF_VALIDATE_OR_GOTO (this->name, conf->methods, out);
+
if (__is_root_gfid (loc->gfid)) {
subvol = dht_first_up_subvol (this);
goto out;
@@ -523,7 +531,7 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
goto out;
}
- subvol = dht_layout_search (this, layout, loc->name);
+ subvol = methods->layout_search (this, layout, loc->name);
if (!subvol) {
gf_msg_debug (this->name, 0,
@@ -846,6 +854,18 @@ dht_migration_complete_check_task (void *data)
SYNCTASK_SETID (frame->root->uid, frame->root->gid);
}
+ /*
+ * temporary check related to tier promoting/demoting the file;
+ * the lower level DHT detects the migration (due to sticky
+ * bits) when it is the responsibility of the tier translator
+ * to complete the rebalance transaction. It will be corrected
+ * when rebalance and tier migration are fixed to work together.
+ */
+ if (strcmp(this->parents->xlator->type, "cluster/tier") == 0) {
+ ret = 0;
+ goto out;
+ }
+
if (!ret)
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index d78dd2ea0ef..78e3ef4233b 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -1064,7 +1064,6 @@ dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_INODELK);
if (!local) {
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
index f4096ae8f1e..eb4c356e3c2 100644
--- a/xlators/cluster/dht/src/dht-messages.h
+++ b/xlators/cluster/dht/src/dht-messages.h
@@ -441,12 +441,25 @@
#define DHT_MSG_LOG_FIXED_LAYOUT (GLFS_DHT_BASE + 36)
-/*------------*/
-#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
+/*
+ * @messageid 109037
+ * @diagnosis Informational message regarding error in tier operation
+ * @recommendedaction None
+ */
+#define DHT_MSG_LOG_TIER_ERROR (GLFS_DHT_BASE + 37)
-#endif /* _DHT_MESSAGES_H_ */
+/*
+ * @messageid 109038
+ * @diagnosis Informational message regarding tier operation
+ * @recommendedaction None
+ */
+#define DHT_MSG_LOG_TIER_STATUS (GLFS_DHT_BASE + 38)
+/*------------*/
+#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
+
+#endif /* _DHT_MESSAGES_H_ */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 3531872dd31..b838ecec4b7 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -19,6 +19,7 @@
#include <signal.h>
#include <fnmatch.h>
#include <signal.h>
+#include "tier.h"
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -919,6 +920,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
tmp_loc.inode = inode_ref (loc->inode);
uuid_copy (tmp_loc.gfid, loc->gfid);
+ tmp_loc.path = gf_strdup(loc->path);
ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
&flock, NULL, NULL);
@@ -984,6 +986,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
ret = __dht_check_free_space (to, from, loc, &stbuf, flag);
+
if (ret) {
goto out;
}
@@ -1713,7 +1716,8 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
goto out;
}
- if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+ (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data);
if (ret)
goto out;
@@ -1863,7 +1867,6 @@ out:
}
-
int
gf_defrag_start_crawl (void *data)
{
@@ -1878,6 +1881,7 @@ gf_defrag_start_crawl (void *data)
dict_t *migrate_data = NULL;
dict_t *status = NULL;
glusterfs_ctx_t *ctx = NULL;
+ dht_methods_t *methods = NULL;
this = data;
if (!this)
@@ -1942,7 +1946,8 @@ gf_defrag_start_crawl (void *data)
goto out;
}
- if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+ (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
migrate_data = dict_new ();
if (!migrate_data) {
ret = -1;
@@ -1959,15 +1964,28 @@ gf_defrag_start_crawl (void *data)
if (ret)
goto out;
}
+
ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,
migrate_data);
+
+ if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
+ methods = conf->methods;
+ if (!methods) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "Methods invalid for translator.");
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
+ methods->migration_other(this, defrag);
+ }
+
if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
(defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {
defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
}
-
-
out:
LOCK (&defrag->lock);
{
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 860f3e716f0..1e666bd8140 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -29,6 +29,8 @@
*/
struct volume_options options[];
+extern dht_methods_t dht_methods;
+
void
dht_layout_dump (dht_layout_t *layout, const char *prefix)
{
@@ -701,6 +703,8 @@ dht_init (xlator_t *this)
if (dht_set_subvol_range(this))
goto err;
+ conf->methods = &dht_methods;
+
return 0;
err:
@@ -847,6 +851,33 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_XLATOR
},
+ /* tier options */
+ { .key = {"tier-promote-frequency"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "120",
+ .description = "Frequency to promote files to fast tier"
+ },
+
+ { .key = {"tier-demote-frequency"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "120",
+ .description = "Frequency to demote files to slow tier"
+ },
+
+ { .key = {"write-freq-thresold"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0",
+ .description = "Defines the write fequency "
+ "that would be considered hot"
+ },
+
+ { .key = {"read-freq-thresold"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0",
+ .description = "Defines the read fequency "
+ "that would be considered hot"
+ },
+
/* switch option */
{ .key = {"pattern.switch.case"},
.type = GF_OPTION_TYPE_ANY
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index fc0ca2f7735..3934df5ec64 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -17,6 +17,12 @@
#include "statedump.h"
#include "dht-common.h"
+dht_methods_t dht_methods = {
+ .migration_get_dst_subvol = dht_migration_get_dst_subvol,
+ .migration_needed = dht_migration_needed,
+ .layout_search = dht_layout_search,
+};
+
class_methods_t class_methods = {
.init = dht_init,
.fini = dht_fini,
@@ -86,4 +92,3 @@ struct xlator_cbks cbks = {
// .releasedir = dht_releasedir,
.forget = dht_forget
};
-;
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index f188a5479f4..72d6d9c10e5 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -621,6 +621,11 @@ nufa_init (xlator_t *this)
return 0;
}
+dht_methods_t dht_methods = {
+ .migration_get_dst_subvol = dht_migration_get_dst_subvol,
+ .migration_needed = dht_migration_needed,
+ .layout_search = dht_layout_search,
+};
class_methods_t class_methods = {
.init = nufa_init,
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
new file mode 100644
index 00000000000..028a42f7a1a
--- /dev/null
+++ b/xlators/cluster/dht/src/tier.c
@@ -0,0 +1,1007 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "dht-common.h"
+#include "tier.h"
+
+/*Hard coded DB info*/
+static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3;
+/*Hard coded DB info*/
+
+/*Mutex for updating the data movement stats*/
+static pthread_mutex_t dm_stat_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+#define DB_QUERY_RECORD_SIZE 4096
+
+static int
+tier_parse_query_str (char *query_record_str,
+ char *gfid, char *link_buffer, ssize_t *link_size) /* split a "gfid|link-info|size" record; gfid must hold UUID_CANONICAL_FORM_LEN+1 bytes and link_buffer DB_QUERY_RECORD_SIZE bytes (see caller). Returns 0 on success, -1 on malformed input */
+{
+ char *token_str = NULL;
+ char *delimiter = "|";
+ char *saveptr = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("tier", query_record_str, out);
+ GF_VALIDATE_OR_GOTO ("tier", gfid, out);
+ GF_VALIDATE_OR_GOTO ("tier", link_buffer, out);
+ GF_VALIDATE_OR_GOTO ("tier", link_size, out);
+
+ token_str = strtok_r (query_record_str, delimiter, &saveptr);
+ if (!token_str)
+ goto out;
+
+ snprintf (gfid, UUID_CANONICAL_FORM_LEN + 1, "%s", token_str); /* bounded: was strcpy() of an up-to-4K token into a 37-byte buffer */
+
+
+ token_str = strtok_r (NULL, delimiter, &saveptr);
+ if (!token_str)
+ goto out;
+
+ snprintf (link_buffer, DB_QUERY_RECORD_SIZE, "%s", token_str); /* bounded copy; truncation drops link records but cannot overflow */
+
+ token_str = strtok_r (NULL, delimiter, &saveptr);
+ if (!token_str)
+ goto out;
+
+ *link_size = atoi (token_str); /* NOTE(review): atoi reports no errors — strtol would detect junk; size is treated as a hint only */
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int
+tier_migrate_using_query_file (void *_args) /* worker: read "gfid|links|size" records from the query file and migrate each hard link by setting a "force" migrate xattr; returns the accumulated per-file status (0 = all files ok) */
+{
+ int ret = -1;
+ char gfid_str[UUID_CANONICAL_FORM_LEN+1] = "";
+ char query_record_str[4096] = "";
+ query_cbk_args_t *query_cbk_args = (query_cbk_args_t *) _args;
+ xlator_t *this = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ char *token_str = NULL;
+ char *delimiter = "::"; /* NOTE(review): strtok treats this as the char SET {':'}, not the sequence "::" — confirm link records never contain single colons */
+ char *link_buffer = NULL;
+ gfdb_query_record_t *query_record = NULL;
+ gfdb_link_info_t *link_info = NULL;
+ struct iatt par_stbuf = {0,};
+ struct iatt current = {0,};
+ loc_t p_loc = {0,};
+ loc_t loc = {0,};
+ dict_t *migrate_data = NULL;
+ inode_t *linked_inode = NULL;
+ int per_file_status = 0;
+ int per_link_status = 0;
+ int total_status = 0;
+ FILE *queryFILE = NULL;
+ char *link_str = NULL;
+
+ GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out);
+ GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out);
+ this = query_cbk_args->this;
+ GF_VALIDATE_OR_GOTO (this->name, query_cbk_args->defrag, out);
+ GF_VALIDATE_OR_GOTO (this->name, query_cbk_args->queryFILE, out);
+
+ defrag = query_cbk_args->defrag;
+
+ queryFILE = query_cbk_args->queryFILE;
+
+ query_record = gfdb_query_record_init();
+ if (!query_record) {
+ goto out; /* NOTE(review): early exits return total_status == 0, i.e. success — confirm intended */
+ }
+
+ query_record->_link_info_str = calloc (DB_QUERY_RECORD_SIZE, 1); /* raw calloc (not GF_CALLOC); released via free(link_buffer) at out */
+ if (!query_record->_link_info_str) {
+ goto out;
+ }
+ link_buffer = query_record->_link_info_str; /* alias: link_buffer and _link_info_str are the same allocation */
+
+ link_info = gfdb_link_info_init (); /* NULL tolerated here; checked per link inside the loop */
+
+ migrate_data = dict_new ();
+ if (!migrate_data)
+ goto out;
+
+ /* Per file */
+ while (fscanf (queryFILE, "%s", query_record_str) != EOF) { /* NOTE(review): unbounded %s into a 4096-byte buffer — an oversized record overflows; cap with %4095s */
+
+ per_file_status = 0;
+ per_link_status = 0;
+
+ memset (gfid_str, 0, UUID_CANONICAL_FORM_LEN+1);
+ memset (query_record->_link_info_str, 0, DB_QUERY_RECORD_SIZE);
+
+ if (tier_parse_query_str (query_record_str, gfid_str,
+ link_buffer,
+ &query_record->link_info_size)) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "failed parsing %s\n", query_record_str);
+ continue; /* malformed record: skipped, not counted as a failure */
+ }
+
+ uuid_parse (gfid_str, query_record->gfid);
+
+ if (dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY)) /* clear keys left over from the previous file */
+ dict_del(migrate_data, GF_XATTR_FILE_MIGRATE_KEY);
+
+ if (dict_get(migrate_data, "from.migrator"))
+ dict_del(migrate_data, "from.migrator");
+
+ token_str = strtok (link_buffer, delimiter); /* NOTE(review): strtok is not thread-safe; strtok_r is used elsewhere in this file */
+ if (token_str != NULL) {
+ per_file_status =
+ dict_set_str (migrate_data,
+ GF_XATTR_FILE_MIGRATE_KEY,
+ "force");
+ if (per_file_status) {
+ goto per_file_out;
+ }
+
+ /* Flag to suggest the xattr call is from migrator */
+ per_file_status = dict_set_str (migrate_data,
+ "from.migrator", "yes");
+ if (per_file_status) {
+ goto per_file_out;
+ }
+ }
+ per_link_status = 0;
+ /* Per link of file */
+ while (token_str != NULL) {
+
+ link_str = gf_strdup (token_str);
+
+ if (!link_info) {
+ per_link_status = -1;
+ goto per_file_out; /* NOTE(review): bypasses the GF_FREE(link_str) at error: — leaks link_str */
+ }
+
+ memset (link_info, 0, sizeof(gfdb_link_info_t));
+
+ ret = str_to_link_info (link_str, link_info);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "failed parsing %s\n", link_str);
+ per_link_status = -1;
+ goto error;
+ }
+
+ uuid_copy (p_loc.gfid, link_info->pargfid);
+
+ p_loc.inode = inode_new (defrag->root_inode->table);
+ if (!p_loc.inode) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ "failed parsing %s\n", link_str); /* NOTE(review): message says "parsing" but this is an inode_new failure */
+ per_link_status = -1;
+ goto error;
+ }
+
+ ret = syncop_lookup (this, &p_loc, NULL, &par_stbuf,
+ NULL, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR,
+ " ERROR in parent lookup\n");
+ per_link_status = -1;
+ goto error;
+ }
+ linked_inode = inode_link (p_loc.inode, NULL, NULL,
+ &par_stbuf);
+ inode_unref (p_loc.inode);
+ p_loc.inode = linked_inode;
+
+ uuid_copy (loc.gfid, query_record->gfid);
+ loc.inode = inode_new (defrag->root_inode->table); /* NOTE(review): unchecked, unlike p_loc.inode above */
+ uuid_copy (loc.pargfid, link_info->pargfid);
+ loc.parent = inode_ref(p_loc.inode);
+
+ loc.name = gf_strdup (link_info->file_name);
+ if (!loc.name) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR, "ERROR in "
+ "memory allocation\n");
+ per_link_status = -1;
+ goto error;
+ }
+
+ loc.path = gf_strdup (link_info->file_path);
+ if (!loc.path) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR, "ERROR in "
+ "memory allocation\n");
+ per_link_status = -1;
+ goto error;
+ }
+
+ uuid_copy (loc.parent->gfid, link_info->pargfid);
+
+ ret = syncop_lookup (this, &loc, NULL, &current,
+ NULL, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR, "ERROR in "
+ "current lookup\n");
+ per_link_status = -1;
+ goto error;
+ }
+ linked_inode = inode_link (loc.inode, NULL, NULL,
+ &current);
+ inode_unref (loc.inode);
+ loc.inode = linked_inode;
+
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LOG_TIER_STATUS, "Tier migrate file %s",
+ loc.name);
+
+ ret = syncop_setxattr (this, &loc, migrate_data, 0); /* triggers the dht_setxattr migration path with "force" */
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_LOG_TIER_ERROR, "ERROR %d in "
+ "current migration %s %s\n", ret,
+ loc.name,
+ loc.path);
+ per_link_status = -1;
+ goto error;
+ }
+ inode_unref (loc.inode);
+ inode_unref (loc.parent);
+ inode_unref (p_loc.inode); /* NOTE(review): the error paths above do not drop these refs — possible inode leaks */
+error:
+ if (loc.name) {
+ GF_FREE ((char *) loc.name);
+ }
+ if (loc.path) {
+ GF_FREE ((char *) loc.path);
+ }
+ token_str = NULL;
+ token_str = strtok (NULL, delimiter);
+ GF_FREE (link_str);
+ }
+ per_file_status = per_link_status; /* per-file result is the LAST link's status; earlier link failures are overwritten */
+per_file_out:
+ if (per_file_status) {
+ pthread_mutex_lock (&dm_stat_mutex);
+ defrag->total_failures++;
+ pthread_mutex_unlock (&dm_stat_mutex);
+ } else {
+ pthread_mutex_lock (&dm_stat_mutex);
+ defrag->total_files++;
+ pthread_mutex_unlock (&dm_stat_mutex);
+ }
+ total_status = total_status + per_file_status;
+ per_link_status = 0;
+ per_file_status = 0;
+ query_record_str[0] = '\0';
+ }
+
+out:
+ if (link_buffer)
+ free (link_buffer); /* NOTE(review): link_buffer aliases query_record->_link_info_str; if gfdb_query_record_fini() below also frees it, this is a double free — confirm */
+ gfdb_link_info_fini (&link_info);
+ if (migrate_data)
+ dict_unref (migrate_data);
+ gfdb_query_record_fini (&query_record);
+ return total_status; /* NOTE(review): still 0 (success) when setup failed before the loop — see early goto out above */
+}
+
+
+/*This is the call back function per record/file from data base*/
+static int
+tier_gf_query_callback (gfdb_query_record_t *gfdb_query_record,
+ void *_args) { /* per-DB-record callback: append "gfid|links|size" to the query file consumed by tier_migrate_using_query_file */
+ int ret = -1;
+ char gfid_str[UUID_CANONICAL_FORM_LEN+1] = ""; /* +1: uuid_unparse writes 36 chars plus NUL; the previous size overflowed the stack buffer by one byte */
+ query_cbk_args_t *query_cbk_args = _args;
+
+ GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out);
+ GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->defrag, out);
+ GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->queryFILE, out);
+
+ uuid_unparse (gfdb_query_record->gfid, gfid_str);
+ fprintf (query_cbk_args->queryFILE, "%s|%s|%zd\n", gfid_str, /* %zd: link_info_size appears to be ssize_t (see tier_parse_query_str) — %ld misbehaves on LLP64 targets */
+ gfdb_query_record->_link_info_str,
+ gfdb_query_record->link_info_size);
+
+ pthread_mutex_lock (&dm_stat_mutex);
+ query_cbk_args->defrag->num_files_lookedup++;
+ pthread_mutex_unlock (&dm_stat_mutex);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*This is the call back function for each brick from hot/cold bricklist
+ * It picks up each bricks db and queries for eligible files for migration.
+ * The list of eligible files are populated in appropriate query files.
+ * Returns 0 on success, -1 on failure. */
+static int
+tier_process_brick_cbk (dict_t *brick_dict, char *key, data_t *value,
+                        void *args) {
+        int ret = -1;
+        char *db_path = NULL;
+        query_cbk_args_t *query_cbk_args = NULL;
+        xlator_t *this = NULL;
+        gfdb_conn_node_t *conn_node = NULL;
+        dict_t *params_dict = NULL;
+        _gfdb_brick_dict_info_t *gfdb_brick_dict_info = args;
+
+        /*Init of all the essentials; query_cbk_args must be checked
+         * before it is dereferenced here or in the cleanup path*/
+        GF_VALIDATE_OR_GOTO ("tier", gfdb_brick_dict_info, out);
+        query_cbk_args = gfdb_brick_dict_info->_query_cbk_args;
+        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out);
+
+        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out);
+        this = query_cbk_args->this;
+
+        GF_VALIDATE_OR_GOTO (this->name, value, out);
+        db_path = data_to_str(value);
+
+        /*Preparing DB parameters before init_db i.e getting db connection*/
+        params_dict = dict_new ();
+        if (!params_dict) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_LOG_TIER_ERROR,
+                        "DB Params cannot initialized!");
+                goto out;
+        }
+        SET_DB_PARAM_TO_DICT(this->name, params_dict, GFDB_SQL_PARAM_DBPATH,
+                             db_path, ret, out);
+
+        /*Get the db connection*/
+        conn_node = init_db((void *)params_dict, dht_tier_db_type);
+        if (!conn_node) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_LOG_TIER_ERROR,
+                        "FATAL: Failed initializing db operations");
+                goto out;
+        }
+
+        /*Query for eligible files from db; results are appended to the
+         * promotion or demotion query file*/
+        query_cbk_args->queryFILE = fopen(GET_QFILE_PATH
+                                (gfdb_brick_dict_info->_gfdb_promote), "a+");
+        if (!query_cbk_args->queryFILE) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                        "Failed to open query file %s:%s",
+                        GET_QFILE_PATH
+                        (gfdb_brick_dict_info->_gfdb_promote),
+                        strerror(errno));
+                goto out;
+        }
+        /* with both thresholds at 0 use the plain time-based queries,
+         * otherwise the frequency-aware variants */
+        if (!gfdb_brick_dict_info->_gfdb_promote) {
+                if (query_cbk_args->defrag->write_freq_threshold == 0 &&
+                        query_cbk_args->defrag->read_freq_threshold == 0) {
+                        ret = find_unchanged_for_time(conn_node,
+                                        tier_gf_query_callback,
+                                        (void *)query_cbk_args,
+                                        gfdb_brick_dict_info->time_stamp);
+                } else {
+                        ret = find_unchanged_for_time_freq(conn_node,
+                                        tier_gf_query_callback,
+                                        (void *)query_cbk_args,
+                                        gfdb_brick_dict_info->time_stamp,
+                                        query_cbk_args->defrag->
+                                                        write_freq_threshold,
+                                        query_cbk_args->defrag->
+                                                        read_freq_threshold,
+                                        _gf_false);
+                }
+        } else {
+                if (query_cbk_args->defrag->write_freq_threshold == 0 &&
+                        query_cbk_args->defrag->read_freq_threshold == 0) {
+                        ret = find_recently_changed_files(conn_node,
+                                        tier_gf_query_callback,
+                                        (void *)query_cbk_args,
+                                        gfdb_brick_dict_info->time_stamp);
+                } else {
+                        ret = find_recently_changed_files_freq(conn_node,
+                                        tier_gf_query_callback,
+                                        (void *)query_cbk_args,
+                                        gfdb_brick_dict_info->time_stamp,
+                                        query_cbk_args->defrag->
+                                                        write_freq_threshold,
+                                        query_cbk_args->defrag->
+                                                        read_freq_threshold,
+                                        _gf_false);
+                }
+        }
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_LOG_TIER_ERROR,
+                        "FATAL: query from db failed");
+                goto out;
+        }
+        ret = 0;
+out:
+        /* query_cbk_args may still be NULL when early validation failed */
+        if (query_cbk_args && query_cbk_args->queryFILE) {
+                fclose (query_cbk_args->queryFILE);
+                query_cbk_args->queryFILE = NULL;
+        }
+        /* init_db reads the params; drop our ref so the dict is not
+         * leaked — NOTE(review): assumes init_db does not take
+         * ownership of params_dict, confirm against gfdb sources */
+        if (params_dict)
+                dict_unref (params_dict);
+        fini_db (conn_node);
+        return ret;
+}
+
+/* Build the promotion/demotion query file: remove any stale file for
+ * this cycle, compute the cut-off timestamp (now - freq_time) and run
+ * the per-brick query callback over args->brick_list.
+ * Returns 0 on success, -1 on failure. */
+inline int
+tier_build_migration_qfile (demotion_args_t *args,
+                            query_cbk_args_t *query_cbk_args,
+                            gf_boolean_t is_promotion)
+{
+        gfdb_time_t current_time;
+        _gfdb_brick_dict_info_t gfdb_brick_dict_info;
+        gfdb_time_t time_in_past;
+        int ret = -1;
+
+        /* start each cycle with a fresh query file */
+        remove (GET_QFILE_PATH (is_promotion));
+        time_in_past.tv_sec = args->freq_time;
+        time_in_past.tv_usec = 0;
+        if (gettimeofday (&current_time, NULL) == -1) {
+                gf_log (args->this->name, GF_LOG_ERROR,
+                        "Failed to get current time");
+                goto out;
+        }
+        time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec;
+        time_in_past.tv_usec = current_time.tv_usec - time_in_past.tv_usec;
+        gfdb_brick_dict_info.time_stamp = &time_in_past;
+        gfdb_brick_dict_info._gfdb_promote = is_promotion;
+        gfdb_brick_dict_info._query_cbk_args = query_cbk_args;
+        ret = dict_foreach (args->brick_list, tier_process_brick_cbk,
+                            &gfdb_brick_dict_info);
+        if (ret) {
+                gf_log (args->this->name, GF_LOG_ERROR,
+                        "Brick query failed");
+                goto out;
+        }
+out:
+        return ret;
+}
+
+/* Open qfile and migrate every file listed in it via
+ * tier_migrate_using_query_file. On failure the query file is kept,
+ * renamed to "<qfile>.err", for post-mortem inspection.
+ * Returns 0 on success, -1 (or the migration status) on failure. */
+inline int
+tier_migrate_files_using_qfile (demotion_args_t *comp,
+                                query_cbk_args_t *query_cbk_args,
+                                char *qfile)
+{
+        char renamed_file[PATH_MAX] = "";
+        int ret = -1;
+
+        query_cbk_args->queryFILE = fopen (qfile, "r");
+        if (!query_cbk_args->queryFILE) {
+                gf_log ("tier", GF_LOG_ERROR,
+                        "Failed opening %s for migration", qfile);
+                goto out;
+        }
+        ret = tier_migrate_using_query_file ((void *)query_cbk_args);
+        fclose (query_cbk_args->queryFILE);
+        query_cbk_args->queryFILE = NULL;
+        if (ret) {
+                /* bounded copy: the ".err" suffix must never be able
+                 * to overflow renamed_file */
+                snprintf (renamed_file, sizeof (renamed_file),
+                          "%s.err", qfile);
+                rename (qfile, renamed_file);
+        }
+out:
+        return ret;
+}
+
+/* Demotion worker thread: builds the demotion query file from the hot
+ * bricklist and migrates the listed files to the cold tier. The exit
+ * status is reported through demotion_args->return_value. */
+static void *
+tier_demote (void *args)
+{
+        int ret = -1;
+        query_cbk_args_t query_cbk_args;
+        demotion_args_t *demotion_args = args;
+
+        /* without the args struct there is nowhere to store a status,
+         * and the original "goto out" would dereference NULL */
+        if (!demotion_args) {
+                gf_log ("tier", GF_LOG_ERROR, "Demotion args missing");
+                return NULL;
+        }
+
+        GF_VALIDATE_OR_GOTO ("tier", demotion_args->this, out);
+        GF_VALIDATE_OR_GOTO (demotion_args->this->name,
+                             demotion_args->brick_list, out);
+        GF_VALIDATE_OR_GOTO (demotion_args->this->name,
+                             demotion_args->defrag, out);
+
+        query_cbk_args.this = demotion_args->this;
+        query_cbk_args.defrag = demotion_args->defrag;
+
+        /*Build the query file using bricklist*/
+        ret = tier_build_migration_qfile (demotion_args, &query_cbk_args,
+                                          _gf_false);
+        if (ret)
+                goto out;
+
+        /* Migrate files using the query file */
+        ret = tier_migrate_files_using_qfile (args,
+                                              &query_cbk_args, DEMOTION_QFILE);
+out:
+        demotion_args->return_value = ret;
+        return NULL;
+}
+
+
+/* Promotion worker thread: builds the promotion query file from the
+ * cold bricklist and migrates the listed files to the hot tier. The
+ * exit status is reported through promotion_args->return_value. */
+static void *
+tier_promote (void *args)
+{
+        int ret = -1;
+        query_cbk_args_t query_cbk_args;
+        promotion_args_t *promotion_args = args;
+
+        /* args must exist before promotion_args->this (or the status
+         * store at "out") can be touched */
+        if (!promotion_args) {
+                gf_log ("tier", GF_LOG_ERROR, "Promotion args missing");
+                return NULL;
+        }
+
+        GF_VALIDATE_OR_GOTO ("tier", promotion_args->this, out);
+        GF_VALIDATE_OR_GOTO (promotion_args->this->name,
+                             promotion_args->brick_list, out);
+        GF_VALIDATE_OR_GOTO (promotion_args->this->name,
+                             promotion_args->defrag, out);
+
+        query_cbk_args.this = promotion_args->this;
+        query_cbk_args.defrag = promotion_args->defrag;
+
+        /*Build the query file using bricklist*/
+        ret = tier_build_migration_qfile (promotion_args, &query_cbk_args,
+                                          _gf_true);
+        if (ret)
+                goto out;
+
+        /* Migrate files using the query file */
+        ret = tier_migrate_files_using_qfile (args,
+                                              &query_cbk_args,
+                                              PROMOTION_QFILE);
+out:
+        promotion_args->return_value = ret;
+        return NULL;
+}
+
+/* Walk the graph below xl and record, in bricklist, the db path of
+ * every local brick: protocol/client subvolumes whose remote-host is
+ * an address of this node. Recurses into all children. */
+static void
+tier_get_bricklist (xlator_t *xl, dict_t *bricklist)
+{
+        xlator_list_t *child = NULL;
+        char *rv = NULL;
+        char *rh = NULL;
+        char localhost[256] = {0};
+        char *db_path = NULL;
+        char *brickname = NULL;
+        char db_name[PATH_MAX] = "";
+        int ret = 0;
+
+        GF_VALIDATE_OR_GOTO ("tier", xl, out);
+        GF_VALIDATE_OR_GOTO ("tier", bricklist, out);
+
+        gethostname (localhost, sizeof (localhost));
+
+        /*
+         * This function obtains remote subvolumes and filters out only
+         * those running on the same node as the tier daemon.
+         */
+        if (strcmp(xl->type, "protocol/client") == 0) {
+                ret = dict_get_str(xl->options, "remote-host", &rh);
+                if (ret < 0)
+                        goto out;
+
+                if (gf_is_local_addr (rh)) {
+
+                        ret = dict_get_str(xl->options, "remote-subvolume",
+                                           &rv);
+                        if (ret < 0)
+                                goto out;
+                        /* strrchr may fail: unconditionally adding 1
+                         * to its result would be undefined behavior */
+                        brickname = strrchr(rv, '/');
+                        if (!brickname)
+                                goto out;
+                        brickname++;
+                        snprintf(db_name, sizeof(db_name), "%s.db",
+                                 brickname);
+                        db_path = GF_CALLOC (PATH_MAX, 1, gf_common_mt_char);
+                        if (!db_path) {
+                                gf_msg ("tier", GF_LOG_ERROR, 0,
+                                        DHT_MSG_LOG_TIER_STATUS,
+                                        "Failed to allocate memory for bricklist");
+                                goto out;
+                        }
+
+                        snprintf(db_path, PATH_MAX, "%s/.glusterfs/%s",
+                                 rv, db_name);
+                        /* dict_add_dynstr_with_alloc duplicates the
+                         * string, so our copy must be freed in both
+                         * the success and the failure case */
+                        ret = dict_add_dynstr_with_alloc(bricklist, "brick",
+                                                         db_path);
+                        GF_FREE (db_path);
+                        db_path = NULL;
+                        if (ret)
+                                goto out;
+                }
+        }
+
+        for (child = xl->children; child; child = child->next) {
+                tier_get_bricklist(child->xlator, bricklist);
+        }
+out:
+        return;
+}
+
+/* Main loop of the tier daemon. Ticks once per second around a
+ * TIMER_SECS wheel; when the promotion or demotion slot comes due it
+ * spawns the corresponding worker thread, joins it and folds its
+ * status into the cumulative result. Loops until defrag_status leaves
+ * GF_DEFRAG_STATUS_STARTED; returns the cumulative status. */
+int
+tier_start (xlator_t *this, gf_defrag_info_t *defrag)
+{
+        dict_t *bricklist_cold = NULL;
+        dict_t *bricklist_hot = NULL;
+        dht_conf_t *conf = NULL;
+        int tick = 0;
+        int next_demote = 0;
+        int next_promote = 0;
+        int freq_promote = 0;
+        int freq_demote = 0;
+        promotion_args_t promotion_args = { 0 };
+        demotion_args_t demotion_args = { 0 };
+        int ret_promotion = 0;
+        int ret_demotion = 0;
+        int ret = 0;
+        pthread_t promote_thread;
+        pthread_t demote_thread;
+
+        conf = this->private;
+
+        bricklist_cold = dict_new();
+        if (!bricklist_cold)
+                return -1;
+
+        bricklist_hot = dict_new();
+        if (!bricklist_hot) {
+                /* don't leak the cold list on this early error path */
+                dict_unref(bricklist_cold);
+                return -1;
+        }
+
+        /* subvolumes[0] is the cold tier, subvolumes[1] the hot tier */
+        tier_get_bricklist (conf->subvolumes[0], bricklist_cold);
+        tier_get_bricklist (conf->subvolumes[1], bricklist_hot);
+
+        freq_promote = defrag->tier_promote_frequency;
+        freq_demote = defrag->tier_demote_frequency;
+
+        next_promote = defrag->tier_promote_frequency % TIMER_SECS;
+        next_demote = defrag->tier_demote_frequency % TIMER_SECS;
+
+
+        gf_msg (this->name, GF_LOG_INFO, 0,
+                DHT_MSG_LOG_TIER_STATUS, "Begin run tier promote %d demote %d",
+                next_promote, next_demote);
+
+        defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+
+        while (1) {
+
+                sleep(1);
+
+                ret_promotion = -1;
+                ret_demotion = -1;
+
+                /* stop requested (or daemon failed) from elsewhere */
+                if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+                        ret = 1;
+                        gf_msg (this->name, GF_LOG_ERROR, 0,
+                                DHT_MSG_LOG_TIER_ERROR,
+                                "defrag->defrag_status != "
+                                "GF_DEFRAG_STATUS_STARTED");
+                        goto out;
+                }
+
+                tick = (tick + 1) % TIMER_SECS;
+                if ((next_demote != tick) && (next_promote != tick))
+                        continue;
+
+                if (next_demote >= tick) {
+                        demotion_args.this = this;
+                        demotion_args.brick_list = bricklist_hot;
+                        demotion_args.defrag = defrag;
+                        demotion_args.freq_time = freq_demote;
+                        ret_demotion = pthread_create (&demote_thread, NULL,
+                                                &tier_demote, &demotion_args);
+                        if (ret_demotion) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
+                                        DHT_MSG_LOG_TIER_ERROR,
+                                        "Failed starting Demotion thread!");
+                        }
+                        freq_demote = defrag->tier_demote_frequency;
+                        next_demote = (tick + freq_demote) % TIMER_SECS;
+                }
+
+                if (next_promote >= tick) {
+                        promotion_args.this = this;
+                        promotion_args.brick_list = bricklist_cold;
+                        promotion_args.defrag = defrag;
+                        promotion_args.freq_time = freq_promote;
+                        ret_promotion = pthread_create (&promote_thread, NULL,
+                                                &tier_promote, &promotion_args);
+                        if (ret_promotion) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
+                                        DHT_MSG_LOG_TIER_ERROR,
+                                        "Failed starting Promotion thread!");
+                        }
+                        freq_promote = defrag->tier_promote_frequency;
+                        next_promote = (tick + freq_promote) % TIMER_SECS;
+                }
+
+                /* join only workers whose pthread_create succeeded and
+                 * pick up the status each reported via its args */
+                if (ret_demotion == 0) {
+                        pthread_join (demote_thread, NULL);
+                        if (demotion_args.return_value) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
+                                        DHT_MSG_LOG_TIER_ERROR,
+                                        "Demotion failed!");
+                        }
+                        ret_demotion = demotion_args.return_value;
+                }
+
+                if (ret_promotion == 0) {
+                        pthread_join (promote_thread, NULL);
+                        if (promotion_args.return_value) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
+                                        DHT_MSG_LOG_TIER_ERROR,
+                                        "Promotion failed!");
+                        }
+                        ret_promotion = promotion_args.return_value;
+                }
+
+                /*Collect previous and current cummulative status */
+                ret = ret | ret_demotion | ret_promotion;
+
+                /*reseting promotion and demotion arguments for next iteration*/
+                memset (&demotion_args, 0, sizeof(demotion_args_t));
+                memset (&promotion_args, 0, sizeof(promotion_args_t));
+
+        }
+
+        ret = 0;
+out:
+
+        dict_unref(bricklist_cold);
+        dict_unref(bricklist_hot);
+
+        return ret;
+}
+
+/* Report whether this daemon instance was started for tiering
+ * (GF_DEFRAG_CMD_START_TIER). Returns 1 if so, 0 otherwise. */
+int32_t
+tier_migration_needed (xlator_t *this)
+{
+        int needed = 0;
+        dht_conf_t *priv = NULL;
+
+        priv = this->private;
+
+        GF_VALIDATE_OR_GOTO (this->name, priv, out);
+        GF_VALIDATE_OR_GOTO (this->name, priv->defrag, out);
+
+        needed = (priv->defrag->cmd == GF_DEFRAG_CMD_START_TIER) ? 1 : 0;
+out:
+        return needed;
+}
+
+/* Choose the migration destination for local's file: the tier
+ * opposite to the one currently caching it (subvolumes[0] is cold,
+ * subvolumes[1] is hot). Returns 0 when a target was set, -1 else. */
+int32_t
+tier_migration_get_dst (xlator_t *this, dht_local_t *local)
+{
+        dht_conf_t *conf = NULL;
+        int32_t ret = -1;
+
+        GF_VALIDATE_OR_GOTO("tier", this, out);
+        GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+        /* local is dereferenced below; validate it like the others */
+        GF_VALIDATE_OR_GOTO(this->name, local, out);
+
+        conf = this->private;
+
+        if (conf->subvolumes[0] == local->cached_subvol)
+                local->rebalance.target_node =
+                        conf->subvolumes[1];
+        else
+                local->rebalance.target_node =
+                        conf->subvolumes[0];
+
+        if (local->rebalance.target_node)
+                ret = 0;
+
+out:
+        return ret;
+}
+
+/* Layout search hook for tier: when the "rule" option names the
+ * first layout entry's subvolume, route there; otherwise route to
+ * the second entry. Returns the chosen subvolume or NULL on bad
+ * arguments. */
+xlator_t *
+tier_search (xlator_t *this, dht_layout_t *layout, const char *name)
+{
+        xlator_t *subvol = NULL;
+        void *rule = NULL;
+
+        GF_VALIDATE_OR_GOTO("tier", this, out);
+        GF_VALIDATE_OR_GOTO(this->name, layout, out);
+        GF_VALIDATE_OR_GOTO(this->name, name, out);
+
+        /* default to the second entry; switch only on a rule match */
+        subvol = layout->list[1].xlator;
+
+        if (dict_get_ptr (this->options, "rule", &rule) == 0 &&
+            strcmp (layout->list[0].xlator->name, rule) == 0)
+                subvol = layout->list[0].xlator;
+
+out:
+        return subvol;
+}
+
+
+/* dht_methods_t hooks that specialize generic DHT behavior for the
+ * tier translator (installed into conf->methods by tier_init). */
+dht_methods_t tier_methods = {
+        .migration_get_dst_subvol = tier_migration_get_dst,
+        .migration_other = tier_start,
+        .migration_needed = tier_migration_needed,
+        .layout_search = tier_search,
+};
+
+
+/* Initialize the tier translator: run the generic dht init, install
+ * the tier method table, and — when running as the rebalance daemon
+ * (conf->defrag set) — read the promote/demote frequencies and the
+ * read/write thresholds. Returns 0 on success, negative on failure. */
+int
+tier_init (xlator_t *this)
+{
+        int ret = -1;
+        int freq = 0;
+        dht_conf_t *conf = NULL;
+        gf_defrag_info_t *defrag = NULL;
+
+        ret = dht_init(this);
+        if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       DHT_MSG_LOG_TIER_ERROR,
+                       "dht_init failed");
+                goto out;
+        }
+
+        conf = this->private;
+
+        conf->methods = &tier_methods;
+
+        if (conf->subvolume_cnt != 2) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       DHT_MSG_LOG_TIER_ERROR,
+                       "Invalid number of subvolumes %d", conf->subvolume_cnt);
+                /* ret is 0 here from dht_init; init must fail, not
+                 * silently succeed with a broken graph */
+                ret = -1;
+                goto out;
+        }
+
+        /* if instantiated from client side initialization is complete. */
+        if (!conf->defrag) {
+                ret = 0;
+                goto out;
+        }
+
+        defrag = conf->defrag;
+
+        /* GF_OPTION_INIT validates the option; the dict lookup below
+         * then applies the compiled-in default when it is unset */
+        GF_OPTION_INIT ("tier-promote-frequency",
+                        defrag->tier_promote_frequency,
+                        int32, out);
+
+        ret = dict_get_int32 (this->options,
+                              "tier-promote-frequency", &freq);
+        if (ret) {
+                freq = DEFAULT_PROMOTE_FREQ_SEC;
+        }
+
+        defrag->tier_promote_frequency = freq;
+
+        GF_OPTION_INIT ("tier-demote-frequency",
+                        defrag->tier_demote_frequency,
+                        int32, out);
+
+        ret = dict_get_int32 (this->options,
+                              "tier-demote-frequency", &freq);
+        if (ret) {
+                freq = DEFAULT_DEMOTE_FREQ_SEC;
+        }
+
+        defrag->tier_demote_frequency = freq;
+
+        GF_OPTION_INIT ("write-freq-threshold",
+                        defrag->write_freq_threshold,
+                        int32, out);
+
+        GF_OPTION_INIT ("read-freq-threshold",
+                        defrag->read_freq_threshold,
+                        int32, out);
+
+        gf_msg(this->name, GF_LOG_INFO, 0,
+               DHT_MSG_LOG_TIER_STATUS,
+               "Promote frequency set to %d demote set to %d",
+               defrag->tier_promote_frequency,
+               defrag->tier_demote_frequency);
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
+
+/* Reconfigure handler: refresh the tiering tunables when running as
+ * the rebalance daemon, then delegate to the generic dht handler. */
+int
+tier_reconfigure (xlator_t *this, dict_t *options)
+{
+        gf_defrag_info_t *defrag = NULL;
+        dht_conf_t *conf = this->private;
+
+        /* client-side instances carry no defrag state to refresh */
+        if (!conf->defrag)
+                goto out;
+
+        defrag = conf->defrag;
+
+        GF_OPTION_RECONF ("tier-promote-frequency",
+                          defrag->tier_promote_frequency, options,
+                          int32, out);
+
+        GF_OPTION_RECONF ("tier-demote-frequency",
+                          defrag->tier_demote_frequency, options,
+                          int32, out);
+
+        GF_OPTION_RECONF ("write-freq-threshold",
+                          defrag->write_freq_threshold, options,
+                          int32, out);
+
+        GF_OPTION_RECONF ("read-freq-threshold",
+                          defrag->read_freq_threshold, options,
+                          int32, out);
+
+out:
+        return dht_reconfigure (this, options);
+}
+
+/* Translator entry points: tier-specific init/reconfigure wrap the
+ * generic dht implementations; fini and notify are reused as-is. */
+class_methods_t class_methods = {
+        .init = tier_init,
+        .fini = dht_fini,
+        .reconfigure = tier_reconfigure,
+        .notify = dht_notify
+};
+
+
+/* All file operations are served by the generic dht fops; the
+ * tier-specific behavior is injected through tier_methods instead. */
+struct xlator_fops fops = {
+        .lookup      = dht_lookup,
+        .create      = dht_create,
+        .mknod       = dht_mknod,
+
+        .stat        = dht_stat,
+        .fstat       = dht_fstat,
+        .truncate    = dht_truncate,
+        .ftruncate   = dht_ftruncate,
+        .access      = dht_access,
+        .readlink    = dht_readlink,
+        .setxattr    = dht_setxattr,
+        .getxattr    = dht_getxattr,
+        .removexattr = dht_removexattr,
+        .open        = dht_open,
+        .readv       = dht_readv,
+        .writev      = dht_writev,
+        .flush       = dht_flush,
+        .fsync       = dht_fsync,
+        .statfs      = dht_statfs,
+        .lk          = dht_lk,
+        .opendir     = dht_opendir,
+        .readdir     = dht_readdir,
+        .readdirp    = dht_readdirp,
+        .fsyncdir    = dht_fsyncdir,
+        .symlink     = dht_symlink,
+        .unlink      = dht_unlink,
+        .link        = dht_link,
+        .mkdir       = dht_mkdir,
+        .rmdir       = dht_rmdir,
+        .rename      = dht_rename,
+        .inodelk     = dht_inodelk,
+        .finodelk    = dht_finodelk,
+        .entrylk     = dht_entrylk,
+        .fentrylk    = dht_fentrylk,
+        .xattrop     = dht_xattrop,
+        .fxattrop    = dht_fxattrop,
+        .setattr     = dht_setattr,
+};
+
+
+/* Inode lifecycle callbacks; forget is handled by generic dht. */
+struct xlator_cbks cbks = {
+        .forget     = dht_forget
+};
+
diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
new file mode 100644
index 00000000000..73266050a5c
--- /dev/null
+++ b/xlators/cluster/dht/src/tier.h
@@ -0,0 +1,71 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _TIER_H_
+#define _TIER_H_
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+/******************************************************************************/
+/* This is from dht-rebalancer.c as we dont have dht-rebalancer.h */
+#include "dht-common.h"
+#include "xlator.h"
+#include <signal.h>
+#include <fnmatch.h>
+#include <signal.h>
+
+#define DEFAULT_PROMOTE_FREQ_SEC 120
+#define DEFAULT_DEMOTE_FREQ_SEC 120
+
+/*
+ * Size of the timer wheel. We would not promote or demote less
+ * frequently than this number.
+ */
+#define TIMER_SECS 3600
+
+#include "gfdb_data_store.h"
+#include <ctype.h>
+#include <sys/xattr.h>
+#include <sys/stat.h>
+
+#define DEMOTION_QFILE "/var/run/gluster/demotequeryfile"
+#define PROMOTION_QFILE "/var/run/gluster/promotequeryfile"
+
+/* Query-file path for this cycle: promotion or demotion. The whole
+ * expansion is parenthesized so the ternary cannot bind to operators
+ * adjacent to the macro at its use site. */
+#define GET_QFILE_PATH(is_promotion)\
+        ((is_promotion) ? PROMOTION_QFILE : DEMOTION_QFILE)
+
+/* Context threaded through the db-query and migration callbacks. */
+typedef struct _query_cbk_args {
+        xlator_t *this;
+        gf_defrag_info_t *defrag;   /* shared defrag state/counters */
+        FILE *queryFILE;            /* query file being written/read */
+} query_cbk_args_t;
+
+int
+gf_run_tier(xlator_t *this, gf_defrag_info_t *defrag);
+
+/* Per-pass parameters handed to the per-brick query callback. */
+typedef struct _gfdb_brick_dict_info {
+        gfdb_time_t *time_stamp;            /* query cut-off time */
+        gf_boolean_t _gfdb_promote;         /* promotion vs demotion pass */
+        query_cbk_args_t *_query_cbk_args;  /* shared callback context */
+} _gfdb_brick_dict_info_t;
+
+/* Arguments for the promotion/demotion worker threads; return_value
+ * carries the thread's status back to the spawner after the join. */
+typedef struct _dm_thread_args {
+        xlator_t *this;
+        gf_defrag_info_t *defrag;
+        dict_t *brick_list;   /* local bricks of the source tier */
+        int freq_time;        /* look-back window in seconds */
+        int return_value;     /* 0 on success, -1 on failure */
+} promotion_args_t, demotion_args_t;
+
+#endif