diff options
| -rw-r--r-- | libglusterfs/src/dict.c | 26 | ||||
| -rw-r--r-- | libglusterfs/src/dict.h | 1 | ||||
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store.h | 1 | ||||
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_sqlite3.h | 1 | ||||
| -rwxr-xr-x | tests/basic/tier/tier.t | 116 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/Makefile.am | 11 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 100 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 29 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 22 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-read.c | 1 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-messages.h | 19 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 28 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 31 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht.c | 7 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/nufa.c | 5 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier.c | 1007 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier.h | 71 | 
17 files changed, 1443 insertions, 33 deletions
diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c index 31867588def..81db64dfd40 100644 --- a/libglusterfs/src/dict.c +++ b/libglusterfs/src/dict.c @@ -2156,6 +2156,32 @@ err:          return ret;  } +int +dict_add_dynstr_with_alloc (dict_t *this, char *key, char *str) +{ +        data_t  *data = NULL; +        int      ret  = 0; +        char    *alloc_str = NULL; + +        alloc_str = gf_strdup (str); +        if (!alloc_str) +                goto out; + +        data = data_from_dynstr (alloc_str); +        if (!data) { +                GF_FREE (alloc_str); +                ret = -EINVAL; +                goto out; +        } + +        ret = dict_add (this, key, data); +        if (ret < 0) +                data_destroy (data); + +out: +        return ret; +} +  /*    for malloced strings we should do a free instead of GF_FREE  */ diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h index a4c1815d06a..a1a4c85f711 100644 --- a/libglusterfs/src/dict.h +++ b/libglusterfs/src/dict.h @@ -244,6 +244,7 @@ GF_MUST_CHECK int dict_set_str (dict_t *this, char *key, char *str);  GF_MUST_CHECK int dict_set_dynmstr (dict_t *this, char *key, char *str);  GF_MUST_CHECK int dict_set_dynstr (dict_t *this, char *key, char *str);  GF_MUST_CHECK int dict_set_dynstr_with_alloc (dict_t *this, char *key, const char *str); +GF_MUST_CHECK int dict_add_dynstr_with_alloc (dict_t *this, char *key, char *str);  GF_MUST_CHECK int dict_get_str (dict_t *this, char *key, char **str);  GF_MUST_CHECK int dict_get_str_boolean (dict_t *this, char *key, int default_val); diff --git a/libglusterfs/src/gfdb/gfdb_data_store.h b/libglusterfs/src/gfdb/gfdb_data_store.h index 1212c2b3fe1..ffe590b84bd 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store.h +++ b/libglusterfs/src/gfdb/gfdb_data_store.h @@ -26,7 +26,6 @@  #include "gfdb_data_store_types.h"  #include "gfdb_sqlite3.h" -  /* GFDB Connection Node:   * ~~~~~~~~~~~~~~~~~~~~   * Represents the connection to the database while using libgfdb diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.h b/libglusterfs/src/gfdb/gfdb_sqlite3.h index b0c4b2b5fdd..04bfde7fa38 100644 --- a/libglusterfs/src/gfdb/gfdb_sqlite3.h +++ b/libglusterfs/src/gfdb/gfdb_sqlite3.h @@ -143,7 +143,6 @@ do {\          };\  } while (0) -  #define GF_SQLITE3_SET_PRAGMA(sqlite3_config_str, param_key, format, value,\                          ret, error)\  do {\ diff --git a/tests/basic/tier/tier.t b/tests/basic/tier/tier.t new file mode 100755 index 00000000000..6bd6fdf8849 --- /dev/null +++ b/tests/basic/tier/tier.t @@ -0,0 +1,116 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function file_on_slow_tier { +    s=$(md5sum $1) +    for i in `seq 0 $LAST_BRICK`; do +        test -e $B0/${V0}${i}/$1 && break; +    done +    if [ $? -eq 0 ] && ! [ "`md5sum $B0/${V0}${i}/$1`" == "$s" ]; then +        echo "0" +    else +        echo "1" +    fi +} + +function file_on_fast_tier { +    local ret="1" + +    s1=$(md5sum $1) +    s2=$(md5sum $B0/${V0}${CACHE_BRICK}/$1) +    if [ -e $B0/${V0}${CACHE_BRICK}/$1 ] && ! [ "$s1" == "$s2" ]; then +        echo "0" +    else +        echo "1" +    fi +} + +function confirm_tier_removed { +    $CLI system getspec $V0 | grep $1 +    if [ $? == 0 ] ; then +        echo "1" +    else +        echo "0" +    fi +} + +LAST_BRICK=1 +CACHE_BRICK=2 +DEMOTE_TIMEOUT=12 +PROMOTE_TIMEOUT=5 +cleanup + + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 $H0:$B0/${V0}{0..$LAST_BRICK} +TEST $CLI volume attach-tier $V0 $H0:$B0/${V0}${CACHE_BRICK} +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 features.ctr-enabled on +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; + +# Basic operations. +cd $M0 +TEST stat . +TEST mkdir d1 +TEST [ -d d1 ] +TEST touch d1/file1 +TEST mkdir d1/d2 +TEST [ -d d1/d2 ] +TEST find d1 + +# Create a file. It should be on the fast tier. +uuidgen > d1/data.txt +TEST file_on_fast_tier d1/data.txt + +# Check manual demotion. +#TEST setfattr -n trusted.distribute.migrate-data d1/data.txt +#TEST file_on_slow_tier d1/data.txt + +TEST $CLI volume set $V0 cluster.tier-demote-frequency 4 +TEST $CLI volume set $V0 cluster.tier-promote-frequency 4 +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume rebalance $V0 tier start +uuidgen > d1/data2.txt +uuidgen > d1/data3.txt +EXPECT "0" file_on_fast_tier d1/data2.txt +EXPECT "0" file_on_fast_tier d1/data3.txt + +# Check auto-demotion on write new. +EXPECT_WITHIN $DEMOTE_TIMEOUT "0" file_on_slow_tier d1/data2.txt +EXPECT_WITHIN $DEMOTE_TIMEOUT "0" file_on_slow_tier d1/data3.txt +sleep 12 +# Check auto-promotion on write append. +uuidgen >> d1/data2.txt + +# Check promotion on read to slow tier +echo 3 > /proc/sys/vm/drop_caches +cat d1/data3.txt +sleep 5 +EXPECT_WITHIN $PROMOTE_TIMEOUT "0" file_on_fast_tier d1/data2.txt +EXPECT_WITHIN $PROMOTE_TIMEOUT "0" file_on_fast_tier d1/data3.txt + +# Test rebalance commands +TEST $CLI volume rebalance $V0 tier status +TEST $CLI volume rebalance $V0 stop + +# stop gluster, when it comes back info file should have tiered volume +killall glusterd +TEST glusterd + +# TBD: Remove force. Gracefully migrate data off hot tier. +# Rebalance+promotion/demotion is under construction. + +TEST $CLI volume detach-tier $V0 + +# temporarily comment out +#TEST ! [ -e $M0/d1/data.txt ] + +EXPECT "0" confirm_tier_removed ${V0}${CACHE_BRICK} + +TEST $CLI volume stop $V0 + +cleanup diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index 8d02749f4d9..46dc4bb840f 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -1,4 +1,4 @@ -xlator_LTLIBRARIES = dht.la nufa.la switch.la +xlator_LTLIBRARIES = dht.la nufa.la switch.la tier.la  xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster  dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ @@ -10,6 +10,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c  nufa_la_SOURCES = $(dht_common_source) nufa.c  switch_la_SOURCES = $(dht_common_source) switch.c +tier_la_SOURCES = $(dht_common_source) tier.c  dht_la_LDFLAGS = -module -avoid-version  dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la @@ -20,10 +21,16 @@ nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la  switch_la_LDFLAGS = -module -avoid-version  switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h \ +tier_la_LDFLAGS = -module -avoid-version +tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\ +		 $(top_builddir)/libglusterfs/src/gfdb/libgfdb.la +AM_CFLAGS = -Wall $(GF_CFLAGS) $(SQLITE_CFLAGS) + +noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier.h\  	$(top_builddir)/xlators/lib/src/libxlator.h  AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ +	-I$(top_srcdir)/libglusterfs/src/gfdb \  	-I$(top_srcdir)/xlators/lib/src  AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 9fda4aa07d6..3a196e07be9 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3175,6 +3175,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,          xlator_t     *subvol   = NULL;          dht_local_t  *local    = NULL;          dht_conf_t   *conf     = NULL; +        dht_methods_t *methods = NULL;          dht_layout_t *layout   = NULL;          int           i        = 0;          int           op_errno = EINVAL; @@ -3191,6 +3192,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,          VALIDATE_OR_GOTO (loc->inode, err);          conf   = this->private; +        GF_VALIDATE_OR_GOTO (this->name, conf, err); + +        methods = conf->methods; +        GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);          GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,                                     op_errno, err); @@ -3255,8 +3260,8 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,                          goto err;                  } -                local->rebalance.target_node = -                        dht_subvol_get_hashed (this, &local->loc); +                methods->migration_get_dst_subvol(this, local); +                  if (!local->rebalance.target_node) {                          gf_msg (this->name, GF_LOG_ERROR, 0,                                  DHT_MSG_HASHED_SUBVOL_GET_FAILED, @@ -3719,7 +3724,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)          VALIDATE_OR_GOTO (frame, err);          VALIDATE_OR_GOTO (this, err);          VALIDATE_OR_GOTO (loc, err); -        VALIDATE_OR_GOTO (loc->inode, err);          VALIDATE_OR_GOTO (this->private, err);          conf = this->private; @@ -3730,7 +3734,7 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)                  goto err;          } -        if (IA_ISDIR (loc->inode->ia_type)) { +        if (!loc->inode || IA_ISDIR (loc->inode->ia_type)) {                  local->call_cnt = conf->subvolume_cnt;                  for (i = 0; i < conf->subvolume_cnt; i++) { @@ -3820,6 +3824,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,          int           count = 0;          dht_layout_t *layout = 0;          dht_conf_t   *conf   = NULL; +        dht_methods_t *methods = NULL;          xlator_t     *subvol = 0;          xlator_t     *hashed_subvol = 0;          int           ret    = 0; @@ -3828,7 +3833,12 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,          INIT_LIST_HEAD (&entries.list);          prev = cookie;          local = frame->local; +          conf  = this->private; +        GF_VALIDATE_OR_GOTO(this->name, conf, unwind); + +        methods = conf->methods; +        GF_VALIDATE_OR_GOTO(this->name, conf->methods, done);          if (op_ret < 0)                  goto done; @@ -3867,8 +3877,8 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,                          } -                        hashed_subvol = dht_layout_search (this, layout, \ -                                                           orig_entry->d_name); +                        hashed_subvol = methods->layout_search (this, layout, +                                                         orig_entry->d_name);                          if (prev->this == hashed_subvol)                                  goto list; @@ -3894,8 +3904,8 @@ list:                  /* Do this if conf->search_unhashed is set to "auto" */                  if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { -                        subvol = dht_layout_search (this, layout, -                                                    orig_entry->d_name); +                        subvol = methods->layout_search (this, layout, +                                                         orig_entry->d_name);                          if (!subvol || (subvol != prev->this)) {                                  /* TODO: Count the number of entries which need                                     linkfile to prove its existence in fs */ @@ -4008,11 +4018,19 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          int           count = 0;          dht_layout_t *layout = 0;          xlator_t     *subvol = 0; +        dht_conf_t   *conf = NULL; +        dht_methods_t *methods = NULL;          INIT_LIST_HEAD (&entries.list);          prev = cookie;          local = frame->local; +        conf = this->private; +        GF_VALIDATE_OR_GOTO (this->name, conf, done); + +        methods = conf->methods; +        GF_VALIDATE_OR_GOTO (this->name, conf->methods, done); +          if (op_ret < 0)                  goto done; @@ -4024,7 +4042,8 @@ dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          list_for_each_entry (orig_entry, (&orig_entries->list), list) {                  next_offset = orig_entry->d_off; -                subvol = dht_layout_search (this, layout, orig_entry->d_name); +                subvol = methods->layout_search (this, layout, +                                                 orig_entry->d_name);                  if (!subvol || (subvol == prev->this)) {                          entry = gf_dirent_for_name (orig_entry->d_name); @@ -6073,11 +6092,13 @@ dht_notify (xlator_t *this, int event, void *data, ...)          gf_defrag_type           cmd    = 0;          dict_t                  *output = NULL;          va_list                  ap; - +        dht_methods_t           *methods = NULL;          conf = this->private; -        if (!conf) -                return ret; +        GF_VALIDATE_OR_GOTO (this->name, conf, out); + +        methods = conf->methods; +        GF_VALIDATE_OR_GOTO (this->name, methods, out);          /* had all subvolumes reported status once till now? */          had_heard_from_all = 1; @@ -6271,12 +6292,18 @@ unlock:                   * not need to handle CHILD_DOWN event here.                   */                  if (conf->defrag) { -                        ret = gf_thread_create (&conf->defrag->th, NULL, -						gf_defrag_start, this); -                        if (ret) { -                                conf->defrag = NULL; +                        if (methods->migration_needed(this)) { +                                ret = gf_thread_create(&conf->defrag->th, +                                                       NULL, +                                                       gf_defrag_start, this); +                                if (ret) { +                                        GF_FREE (conf->defrag); +                                        conf->defrag = NULL; +                                        kill (getpid(), SIGTERM); +                                } +                        } else {                                  GF_FREE (conf->defrag); -                                kill (getpid(), SIGTERM); +                                conf->defrag = NULL;                          }                  }          } @@ -6284,7 +6311,7 @@ unlock:          ret = 0;          if (propagate)                  ret = default_notify (this, event, data); - +out:          return ret;  } @@ -6412,3 +6439,40 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,  err:          GF_FREE (output_string);  } + +int32_t dht_migration_get_dst_subvol(xlator_t *this, dht_local_t  *local) +{ +        int ret = -1; + +        if (!local) +                goto out; + +        local->rebalance.target_node = +                dht_subvol_get_hashed (this, &local->loc); + +        if (local->rebalance.target_node) +                ret = 0; + +out: +        return ret; +} + +int32_t dht_migration_needed(xlator_t *this) +{ +        gf_defrag_info_t        *defrag = NULL; +        dht_conf_t              *conf   = NULL; +        int                      ret = 0; + +        conf = this->private; + +        GF_VALIDATE_OR_GOTO ("dht", conf, out); +        GF_VALIDATE_OR_GOTO ("dht", conf->defrag, out); + +        defrag = conf->defrag; + +        if (defrag->cmd != GF_DEFRAG_CMD_START_TIER) +                ret = 1; + +out: +        return ret; +} diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 67e693146af..9145f336d7c 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -267,6 +267,8 @@ enum gf_defrag_type {          GF_DEFRAG_CMD_STATUS = 1 + 2,          GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,          GF_DEFRAG_CMD_START_FORCE = 1 + 4, +        GF_DEFRAG_CMD_START_TIER = 1 + 5, +        GF_DEFRAG_CMD_STATUS_TIER = 1 + 6,  };  typedef enum gf_defrag_type gf_defrag_type; @@ -310,10 +312,31 @@ struct gf_defrag_info_ {          struct timeval               start_time;          gf_boolean_t                 stats;          gf_defrag_pattern_list_t    *defrag_pattern; +        int                          tier_promote_frequency; +        int                          tier_demote_frequency; + +        /*Data Tiering params for scanner*/ +        uint64_t                     total_files_promoted; +        uint64_t                     total_files_demoted; +        int                          write_freq_threshold; +        int                          read_freq_threshold;  };  typedef struct gf_defrag_info_ gf_defrag_info_t; +struct dht_methods_s { +        int32_t      (*migration_get_dst_subvol)(xlator_t *this, +                                                 dht_local_t *local); +        int32_t      (*migration_other)(xlator_t *this, +                                        gf_defrag_info_t *defrag); +        int32_t      (*migration_needed)(xlator_t *this); +        xlator_t*    (*layout_search)(xlator_t *this, +                                      dht_layout_t *layout, +                                         const char *name); +}; + +typedef struct dht_methods_s dht_methods_t; +  struct dht_conf {          gf_lock_t      subvolume_lock;          int            subvolume_cnt; @@ -371,6 +394,8 @@ struct dht_conf {          gf_boolean_t    do_weighting;          gf_boolean_t    randomize_by_gfid; +        dht_methods_t  *methods; +          struct mem_pool *lock_pool;  };  typedef struct dht_conf dht_conf_t; @@ -477,6 +502,10 @@ dht_layout_t                            *dht_layout_get (xlator_t *this, inode_t  dht_layout_t                            *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);  xlator_t *dht_layout_search (xlator_t   *this, dht_layout_t *layout,                               const char *name); +int32_t +dht_migration_get_dst_subvol(xlator_t *this, dht_local_t  *local); +int32_t +dht_migration_needed(xlator_t *this);  int                                      dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);  int dht_layout_anomalies (xlator_t      *this, loc_t *loc, dht_layout_t *layout,                            uint32_t      *holes_p, uint32_t *overlaps_p, diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index f4e5305d791..346d19bec88 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -502,10 +502,18 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc)  {          dht_layout_t *layout = NULL;          xlator_t     *subvol = NULL; +        dht_conf_t *conf = NULL; +        dht_methods_t *methods = NULL;          GF_VALIDATE_OR_GOTO ("dht", this, out);          GF_VALIDATE_OR_GOTO (this->name, loc, out); +        conf = this->private; +        GF_VALIDATE_OR_GOTO (this->name, conf, out); + +        methods = conf->methods; +        GF_VALIDATE_OR_GOTO (this->name, conf->methods, out); +          if (__is_root_gfid (loc->gfid)) {                  subvol = dht_first_up_subvol (this);                  goto out; @@ -523,7 +531,7 @@ dht_subvol_get_hashed (xlator_t *this, loc_t *loc)                  goto out;          } -        subvol = dht_layout_search (this, layout, loc->name); +        subvol = methods->layout_search (this, layout, loc->name);          if (!subvol) {                  gf_msg_debug (this->name, 0, @@ -846,6 +854,18 @@ dht_migration_complete_check_task (void *data)                  SYNCTASK_SETID (frame->root->uid, frame->root->gid);          } +        /* +         * temporary check related to tier promoting/demoting the file; +         * the lower level DHT detects the migration (due to sticky +         * bits) when it is the responsibility of the tier translator +         * to complete the rebalance transaction. It will be corrected +         * when rebalance and tier migration are fixed to work together. +         */ +        if (strcmp(this->parents->xlator->type, "cluster/tier") == 0) { +                ret = 0; +                goto out; +        } +          if (!ret)                  dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index d78dd2ea0ef..78e3ef4233b 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -1064,7 +1064,6 @@ dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,          VALIDATE_OR_GOTO (this, err);          VALIDATE_OR_GOTO (loc, err);          VALIDATE_OR_GOTO (loc->inode, err); -        VALIDATE_OR_GOTO (loc->path, err);          local = dht_local_init (frame, loc, NULL, GF_FOP_INODELK);          if (!local) { diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h index f4096ae8f1e..eb4c356e3c2 100644 --- a/xlators/cluster/dht/src/dht-messages.h +++ b/xlators/cluster/dht/src/dht-messages.h @@ -441,12 +441,25 @@  #define DHT_MSG_LOG_FIXED_LAYOUT     (GLFS_DHT_BASE + 36) -/*------------*/ -#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" +/* + * @messageid 109037 + * @diagnosis Informational message regarding error in tier operation + * @recommendedaction None + */ +#define DHT_MSG_LOG_TIER_ERROR     (GLFS_DHT_BASE + 37) -#endif /* _DHT_MESSAGES_H_ */ +/* + * @messageid 109038 + * @diagnosis Informational message regarding tier operation + * @recommendedaction None + */ +#define DHT_MSG_LOG_TIER_STATUS     (GLFS_DHT_BASE + 38) +/*------------*/ +#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" + +#endif /* _DHT_MESSAGES_H_ */ diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 3531872dd31..b838ecec4b7 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -19,6 +19,7 @@  #include <signal.h>  #include <fnmatch.h>  #include <signal.h> +#include "tier.h"  #define GF_DISK_SECTOR_SIZE             512  #define DHT_REBALANCE_PID               4242 /* Change it if required */ @@ -919,6 +920,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,          tmp_loc.inode = inode_ref (loc->inode);          uuid_copy (tmp_loc.gfid, loc->gfid); +        tmp_loc.path = gf_strdup(loc->path);          ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,                                &flock, NULL, NULL); @@ -984,6 +986,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,                  goto out;          ret = __dht_check_free_space (to, from, loc, &stbuf, flag); +          if (ret) {                  goto out;          } @@ -1713,7 +1716,8 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                  goto out;          } -        if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { +        if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && +            (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {                  ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data);                  if (ret)                          goto out; @@ -1863,7 +1867,6 @@ out:  } -  int  gf_defrag_start_crawl (void *data)  { @@ -1878,6 +1881,7 @@ gf_defrag_start_crawl (void *data)          dict_t                  *migrate_data = NULL;          dict_t                  *status = NULL;          glusterfs_ctx_t         *ctx = NULL; +        dht_methods_t           *methods = NULL;          this = data;          if (!this) @@ -1942,7 +1946,8 @@ gf_defrag_start_crawl (void *data)                  goto out;          } -        if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { +        if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && +            (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {                  migrate_data = dict_new ();                  if (!migrate_data) {                          ret = -1; @@ -1959,15 +1964,28 @@ gf_defrag_start_crawl (void *data)                  if (ret)                          goto out;          } +          ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,                                      migrate_data); + +        if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) { +                methods = conf->methods; +                if (!methods) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_LOG_TIER_ERROR, +                                "Methods invalid for translator."); +                        defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; +                        ret = -1; +                        goto out; +                } +                methods->migration_other(this, defrag); +        } +          if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&              (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {                  defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;          } - -  out:          LOCK (&defrag->lock);          { diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 860f3e716f0..1e666bd8140 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -29,6 +29,8 @@  */  struct volume_options options[]; +extern dht_methods_t dht_methods; +  void  dht_layout_dump (dht_layout_t  *layout, const char *prefix)  { @@ -701,6 +703,8 @@ dht_init (xlator_t *this)          if (dht_set_subvol_range(this))                  goto err; +        conf->methods = &dht_methods; +          return 0;  err: @@ -847,6 +851,33 @@ struct volume_options options[] = {            .type = GF_OPTION_TYPE_XLATOR          }, +        /* tier options */ +        { .key  = {"tier-promote-frequency"}, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "120", +          .description = "Frequency to promote files to fast tier" +        }, + +        { .key  = {"tier-demote-frequency"}, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "120", +          .description = "Frequency to demote files to slow tier" +        }, + +        { .key  = {"write-freq-thresold"}, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "0", +          .description = "Defines the write fequency " +                        "that would be considered hot" +        }, + +        { .key  = {"read-freq-thresold"}, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "0", +          .description = "Defines the read fequency " +                        "that would be considered hot" +        }, +          /* switch option */          { .key  = {"pattern.switch.case"},            .type = GF_OPTION_TYPE_ANY diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index fc0ca2f7735..3934df5ec64 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -17,6 +17,12 @@  #include "statedump.h"  #include "dht-common.h" +dht_methods_t dht_methods = { +        .migration_get_dst_subvol = dht_migration_get_dst_subvol, +        .migration_needed = dht_migration_needed, +        .layout_search   = dht_layout_search, +}; +  class_methods_t class_methods = {          .init           = dht_init,          .fini           = dht_fini, @@ -86,4 +92,3 @@ struct xlator_cbks cbks = {  //      .releasedir = dht_releasedir,          .forget     = dht_forget  }; -; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index f188a5479f4..72d6d9c10e5 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -621,6 +621,11 @@ nufa_init (xlator_t *this)          return 0;  } +dht_methods_t dht_methods = { +        .migration_get_dst_subvol = dht_migration_get_dst_subvol, +        .migration_needed = dht_migration_needed, +        .layout_search   = dht_layout_search, +};  class_methods_t class_methods = {          .init           = nufa_init, diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c new file mode 100644 index 00000000000..028a42f7a1a --- /dev/null +++ b/xlators/cluster/dht/src/tier.c @@ -0,0 +1,1007 @@ +/* +  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.h" +#include "tier.h" + +/*Hard coded DB info*/ +static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3; +/*Hard coded DB info*/ + +/*Mutex for updating the data movement stats*/ +static pthread_mutex_t dm_stat_mutex = PTHREAD_MUTEX_INITIALIZER; + +#define DB_QUERY_RECORD_SIZE 4096 + +static int +tier_parse_query_str (char *query_record_str, +                      char *gfid, char *link_buffer, ssize_t *link_size) +{ +        char *token_str = NULL; +        char *delimiter = "|"; +        char *saveptr = NULL; +        int ret = -1; + +        GF_VALIDATE_OR_GOTO ("tier", query_record_str, out); +        GF_VALIDATE_OR_GOTO ("tier", gfid, out); +        GF_VALIDATE_OR_GOTO ("tier", link_buffer, out); +        GF_VALIDATE_OR_GOTO ("tier", link_size, out); + +        token_str = strtok_r (query_record_str, delimiter, &saveptr); +        if (!token_str) +                goto out; + +        strcpy (gfid, token_str); + + +        token_str = strtok_r (NULL, delimiter, &saveptr); +        if (!token_str) +                goto out; + +        strcpy (link_buffer, token_str); + +        token_str = strtok_r (NULL, delimiter, &saveptr); +        if (!token_str) +                goto out; + +        *link_size = atoi (token_str); + +        ret = 0; +out: +        return ret; +} + +static int +tier_migrate_using_query_file (void *_args) +{ +        int ret                                 = -1; +        char gfid_str[UUID_CANONICAL_FORM_LEN+1] = ""; +        char query_record_str[4096]             = ""; +        query_cbk_args_t *query_cbk_args       = (query_cbk_args_t *) _args; +        xlator_t *this                          = NULL; +        gf_defrag_info_t *defrag                = NULL; +        char *token_str                         = NULL; +        char *delimiter                         = "::"; +        char *link_buffer                       = NULL; +        gfdb_query_record_t *query_record       = NULL; +        gfdb_link_info_t *link_info             = NULL; +        struct iatt par_stbuf                      = {0,}; +        struct iatt current                     = {0,}; +        loc_t p_loc                             = {0,}; +        loc_t loc                               = {0,}; +        dict_t *migrate_data                    = NULL; +        inode_t *linked_inode                   = NULL; +        int per_file_status                     = 0; +        int per_link_status                     = 0; +        int total_status                        = 0; +        FILE *queryFILE                         = NULL; +        char *link_str                          = NULL; + +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out); +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out); +        this = query_cbk_args->this; +        GF_VALIDATE_OR_GOTO (this->name, query_cbk_args->defrag, out); +        GF_VALIDATE_OR_GOTO (this->name, query_cbk_args->queryFILE, out); + +        defrag = query_cbk_args->defrag; + +        queryFILE = query_cbk_args->queryFILE; + +        query_record = gfdb_query_record_init(); +        if (!query_record) { +                goto out; +        } + +        query_record->_link_info_str = calloc (DB_QUERY_RECORD_SIZE, 1); +        if (!query_record->_link_info_str) { +                goto out; +        } +        link_buffer = query_record->_link_info_str; + +        link_info = gfdb_link_info_init (); + +        migrate_data = dict_new (); +        if (!migrate_data) +                goto out; + +        /* Per file */ +        while (fscanf (queryFILE, "%s", query_record_str) != EOF) { + +                per_file_status      = 0; +                per_link_status      = 0; + +                memset (gfid_str, 0, UUID_CANONICAL_FORM_LEN+1); +                memset (query_record->_link_info_str, 0, DB_QUERY_RECORD_SIZE); + +                if (tier_parse_query_str (query_record_str, gfid_str, +                                          link_buffer, +                                          &query_record->link_info_size)) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_LOG_TIER_ERROR, +                                "failed parsing %s\n", query_record_str); +                        continue; +                } + +                uuid_parse (gfid_str, query_record->gfid); + +                if (dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY)) +                        dict_del(migrate_data, GF_XATTR_FILE_MIGRATE_KEY); + +                if (dict_get(migrate_data, "from.migrator")) +                        dict_del(migrate_data, "from.migrator"); + +                token_str = strtok (link_buffer, delimiter); +                if (token_str != NULL) { +                        per_file_status = +                                dict_set_str (migrate_data, +                                              GF_XATTR_FILE_MIGRATE_KEY, +                                              "force"); +                        if (per_file_status) { +                                goto per_file_out; +                        } + +                        /* Flag to suggest the xattr call is from migrator */ +                        per_file_status = dict_set_str (migrate_data, +                                "from.migrator", "yes"); +                        if (per_file_status) { +                                goto per_file_out; +                        } +                } +                per_link_status = 0; +                /* Per link of file */ +                while (token_str != NULL) { + +                        link_str = gf_strdup (token_str); + +                        if (!link_info) { +                                per_link_status = -1; +                                goto per_file_out; +                        } + +                        memset (link_info, 0, sizeof(gfdb_link_info_t)); + +                        ret = str_to_link_info (link_str, link_info); +                        if (ret) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        "failed parsing %s\n", link_str); +                                per_link_status = -1; +                                goto error; +                        } + +                        uuid_copy (p_loc.gfid, link_info->pargfid); + +                        p_loc.inode = inode_new (defrag->root_inode->table); +                        if (!p_loc.inode) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        "failed parsing %s\n", link_str); +                                per_link_status = -1; +                                goto error; +                        } + +                        ret = syncop_lookup (this, &p_loc, NULL, &par_stbuf, +                                             NULL, NULL); +                        if (ret) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        " ERROR in parent lookup\n"); +                                per_link_status = -1; +                                goto error; +                        } +                        linked_inode = inode_link (p_loc.inode, NULL, NULL, +                                                        &par_stbuf); +                        inode_unref (p_loc.inode); +                        p_loc.inode = linked_inode; + +                        uuid_copy (loc.gfid, query_record->gfid); +                        loc.inode = inode_new (defrag->root_inode->table); +                        uuid_copy (loc.pargfid, link_info->pargfid); +                        loc.parent = inode_ref(p_loc.inode); + +                        loc.name = gf_strdup (link_info->file_name); +                        if (!loc.name) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, "ERROR in " +                                        "memory allocation\n"); +                                per_link_status = -1; +                                goto error; +                        } + +                        loc.path = gf_strdup (link_info->file_path); +                        if (!loc.path) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, "ERROR in " +                                        "memory allocation\n"); +                                per_link_status = -1; +                                goto error; +                        } + +                        uuid_copy (loc.parent->gfid, link_info->pargfid); + +                        ret = syncop_lookup (this, &loc, NULL, ¤t, +                                             NULL, NULL); +                        if (ret) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, "ERROR in " +                                        "current lookup\n"); +                                per_link_status = -1; +                                goto error; +                        } +                        linked_inode = inode_link (loc.inode, NULL, NULL, +                                                        ¤t); +                        inode_unref (loc.inode); +                        loc.inode = linked_inode; + +                        gf_msg (this->name, GF_LOG_INFO, 0, +                                DHT_MSG_LOG_TIER_STATUS, "Tier migrate file %s", +                                loc.name); + +                        ret = syncop_setxattr (this, &loc, migrate_data, 0); +                        if (ret) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, "ERROR %d in " +                                        "current migration %s %s\n", ret, +                                        loc.name, +                                        loc.path); +                                per_link_status = -1; +                                goto error; +                        } +                        inode_unref (loc.inode); +                        inode_unref (loc.parent); +                        inode_unref (p_loc.inode); +error: +                        if (loc.name) { +                                GF_FREE ((char *) loc.name); +                        } +                        if (loc.path) { +                                GF_FREE ((char *) loc.path); +                        } +                        token_str = NULL; +                        token_str = strtok (NULL, delimiter); +                        GF_FREE (link_str); +                } +                per_file_status = per_link_status; +per_file_out: +                if (per_file_status) { +                        pthread_mutex_lock (&dm_stat_mutex); +                        defrag->total_failures++; +                        pthread_mutex_unlock (&dm_stat_mutex); +                } else { +                        pthread_mutex_lock (&dm_stat_mutex); +                        defrag->total_files++; +                        pthread_mutex_unlock (&dm_stat_mutex); +                } +                total_status = total_status + per_file_status; +                per_link_status = 0; +                per_file_status = 0; +                query_record_str[0] = '\0'; +        } + +out: +        if (link_buffer) +                free (link_buffer); +        gfdb_link_info_fini (&link_info); +        if (migrate_data) +                dict_unref (migrate_data); +        gfdb_query_record_fini (&query_record); +        return total_status; +} + + +/*This is the call back function per record/file from data base*/ +static int +tier_gf_query_callback (gfdb_query_record_t *gfdb_query_record, +                        void *_args) { +        int ret = -1; +        char gfid_str[UUID_CANONICAL_FORM_LEN] = ""; +        query_cbk_args_t *query_cbk_args = _args; + +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out); +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->defrag, out); +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->queryFILE, out); + +        uuid_unparse (gfdb_query_record->gfid, gfid_str); +        fprintf (query_cbk_args->queryFILE, "%s|%s|%ld\n", gfid_str, +                 gfdb_query_record->_link_info_str, +                 gfdb_query_record->link_info_size); + +        pthread_mutex_lock (&dm_stat_mutex); +        query_cbk_args->defrag->num_files_lookedup++; +        pthread_mutex_unlock (&dm_stat_mutex); + +        ret = 0; +out: +        return ret; +} + +/*This is the call back function for each brick from hot/cold bricklist + * It picks up each bricks db and queries for eligible files for migration. + * The list of eligible files are populated in appropriate query files*/ +static int +tier_process_brick_cbk (dict_t *brick_dict, char *key, data_t *value, +                        void *args) { +        int ret                                         = -1; +        char *db_path                                   = NULL; +        query_cbk_args_t *query_cbk_args              	= NULL; +        xlator_t *this                                  = NULL; +        gfdb_conn_node_t *conn_node                   	= NULL; +        dict_t *params_dict                             = NULL; +        _gfdb_brick_dict_info_t *gfdb_brick_dict_info   = args; + +        /*Init of all the essentials*/ +        GF_VALIDATE_OR_GOTO ("tier", gfdb_brick_dict_info , out); +        query_cbk_args = gfdb_brick_dict_info->_query_cbk_args; + +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out); +        this = query_cbk_args->this; + +        GF_VALIDATE_OR_GOTO (this->name, +                             gfdb_brick_dict_info->_query_cbk_args, out); + +        GF_VALIDATE_OR_GOTO (this->name, value, out); +        db_path = data_to_str(value); + +        /*Preparing DB parameters before init_db i.e getting db connection*/ +        params_dict = dict_new (); +        if (!params_dict) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "DB Params cannot initialized!"); +                goto out; +        } +        SET_DB_PARAM_TO_DICT(this->name, params_dict, GFDB_SQL_PARAM_DBPATH, +                                db_path, ret, out); + +        /*Get the db connection*/ +        conn_node = init_db((void *)params_dict, dht_tier_db_type); +        if (!conn_node) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "FATAL: Failed initializing db operations"); +                         goto out; +        } + +        /*Query for eligible files from db*/ +        query_cbk_args->queryFILE = fopen(GET_QFILE_PATH +                                (gfdb_brick_dict_info->_gfdb_promote), "a+"); +        if (!query_cbk_args->queryFILE) { +                gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, +                                "Failed to open query file %s:%s", +                                GET_QFILE_PATH +                        (gfdb_brick_dict_info->_gfdb_promote), +                        strerror(errno)); +                goto out; +        } +        if (!gfdb_brick_dict_info->_gfdb_promote) { +                if (query_cbk_args->defrag->write_freq_threshold == 0 && +                        query_cbk_args->defrag->read_freq_threshold == 0) { +                                ret = find_unchanged_for_time(conn_node, +                                        tier_gf_query_callback, +                                        (void *)query_cbk_args, +                                        gfdb_brick_dict_info->time_stamp); +                } else { +                                ret = find_unchanged_for_time_freq(conn_node, +                                        tier_gf_query_callback, +                                        (void *)query_cbk_args, +                                        gfdb_brick_dict_info->time_stamp, +                                        query_cbk_args->defrag-> +                                                        write_freq_threshold, +                                        query_cbk_args->defrag-> +                                                        read_freq_threshold, +                                        _gf_false); +                } +        } else { +                if (query_cbk_args->defrag->write_freq_threshold == 0 && +                        query_cbk_args->defrag->read_freq_threshold == 0) { +                        ret = find_recently_changed_files(conn_node, +                                        tier_gf_query_callback, +                                        (void *)query_cbk_args, +                                        gfdb_brick_dict_info->time_stamp); +                } else { +                        ret = find_recently_changed_files_freq(conn_node, +                                        tier_gf_query_callback, +                                        (void *)query_cbk_args, +                                        gfdb_brick_dict_info->time_stamp, +                                        query_cbk_args->defrag-> +                                                        write_freq_threshold, +                                        query_cbk_args->defrag-> +                                                        read_freq_threshold, +                                        _gf_false); +                } +        } +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_LOG_TIER_ERROR, +                                "FATAL: query from db failed"); +                        goto out; +                } +        ret = 0; +out: +        if (query_cbk_args->queryFILE) { +                fclose (query_cbk_args->queryFILE); +                query_cbk_args->queryFILE = NULL; +        } +        fini_db (conn_node); +        return ret; +} + +inline int +tier_build_migration_qfile (demotion_args_t *args, +                            query_cbk_args_t *query_cbk_args, +                            gf_boolean_t is_promotion) +{ +        gfdb_time_t                     current_time; +        _gfdb_brick_dict_info_t         gfdb_brick_dict_info; +        gfdb_time_t                     time_in_past; +        int                             ret = -1; + +        remove (GET_QFILE_PATH (is_promotion)); +        time_in_past.tv_sec = args->freq_time; +        time_in_past.tv_usec = 0; +        if (gettimeofday (¤t_time, NULL) == -1) { +                gf_log (args->this->name, GF_LOG_ERROR, +                        "Failed to get current timen"); +                goto out; +        } +        time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; +        time_in_past.tv_usec = current_time.tv_usec - time_in_past.tv_usec; +        gfdb_brick_dict_info.time_stamp = &time_in_past; +        gfdb_brick_dict_info._gfdb_promote = is_promotion; +        gfdb_brick_dict_info._query_cbk_args = query_cbk_args; +        ret = dict_foreach (args->brick_list, tier_process_brick_cbk, +                            &gfdb_brick_dict_info); +        if (ret) { +                gf_log (args->this->name, GF_LOG_ERROR, +                        "Brick query failedn"); +                goto out; +        } +out: +        return ret; +} + +inline int +tier_migrate_files_using_qfile (demotion_args_t *comp, +                                query_cbk_args_t *query_cbk_args, +                                char *qfile) +{ +        char renamed_file[PATH_MAX] = ""; +        int ret = -1; + +        query_cbk_args->queryFILE = fopen (qfile, "r"); +        if (!query_cbk_args->queryFILE) { +                gf_log ("tier", GF_LOG_ERROR, +                        "Failed opening %s for migration", qfile); +                goto out; +        } +        ret = tier_migrate_using_query_file ((void *)query_cbk_args); +        fclose (query_cbk_args->queryFILE); +        query_cbk_args->queryFILE = NULL; +        if (ret) { +                sprintf (renamed_file, "%s.err", qfile); +                rename (qfile, renamed_file); +        } +out: +        return ret; +} + +/*Demotion Thread*/ +static void * +tier_demote (void *args) +{ +        int ret = -1; +        query_cbk_args_t query_cbk_args; +        demotion_args_t *demotion_args = args; + +        GF_VALIDATE_OR_GOTO ("tier", demotion_args, out); +        GF_VALIDATE_OR_GOTO ("tier", demotion_args->this, out); +        GF_VALIDATE_OR_GOTO (demotion_args->this->name, +                             demotion_args->brick_list, out); +        GF_VALIDATE_OR_GOTO (demotion_args->this->name, +                             demotion_args->defrag, out); + +        query_cbk_args.this = demotion_args->this; +        query_cbk_args.defrag = demotion_args->defrag; + +        /*Build the query file using bricklist*/ +        ret = tier_build_migration_qfile(demotion_args, &query_cbk_args, +                                    _gf_false); +        if (ret) +                goto out; + +        /* Migrate files using the query file */ +        ret = tier_migrate_files_using_qfile (args, +                                              &query_cbk_args, DEMOTION_QFILE); +        if (ret) +                goto out; + +out: +        demotion_args->return_value = ret; +        return NULL; +} + + +/*Promotion Thread*/ +static void +*tier_promote (void *args) +{ +        int ret = -1; +        query_cbk_args_t query_cbk_args; +        promotion_args_t *promotion_args = args; + +        GF_VALIDATE_OR_GOTO ("tier", promotion_args->this, out); +        GF_VALIDATE_OR_GOTO (promotion_args->this->name, +                             promotion_args->brick_list, out); +        GF_VALIDATE_OR_GOTO (promotion_args->this->name, +                             promotion_args->defrag, out); + +        query_cbk_args.this = promotion_args->this; +        query_cbk_args.defrag = promotion_args->defrag; + +        /*Build the query file using bricklist*/ +        ret = tier_build_migration_qfile(promotion_args, &query_cbk_args, +                                         _gf_true); +        if (ret) +                goto out; + +        /* Migrate files using the query file */ +        ret = tier_migrate_files_using_qfile (args, +                                              &query_cbk_args, +                                              PROMOTION_QFILE); +        if (ret) +                goto out; + +out: +        promotion_args->return_value = ret; +        return NULL; +} + +static void +tier_get_bricklist (xlator_t *xl, dict_t *bricklist) +{ +        xlator_list_t  *child = NULL; +        char           *rv        = NULL; +        char           *rh        = NULL; +        char           localhost[256] = {0}; +        char           *db_path = ""; +        char           *brickname = NULL; +        char            db_name[PATH_MAX] = ""; +        int             ret = 0; + +        GF_VALIDATE_OR_GOTO ("tier", xl, out); +        GF_VALIDATE_OR_GOTO ("tier", bricklist, out); + +        gethostname (localhost, sizeof (localhost)); + +        /* +         * This function obtains remote subvolumes and filters out only +         * those running on the same node as the tier daemon. +         */ +        if (strcmp(xl->type, "protocol/client") == 0) { +                ret = dict_get_str(xl->options, "remote-host", &rh); +                if (ret < 0) +                        goto out; + +               if (gf_is_local_addr (rh)) { + +                        ret = dict_get_str(xl->options, "remote-subvolume", +                                           &rv); +                        if (ret < 0) +                                goto out; +                        brickname = strrchr(rv, '/') + 1; +                        snprintf(db_name, sizeof(db_name), "%s.db", +                                 brickname); +                        db_path = GF_CALLOC (PATH_MAX, 1, gf_common_mt_char); +                        if (!db_path) { +                                gf_msg ("tier", GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_STATUS, +                                        "Failed to allocate memory for bricklist"); +                                goto out; +                        } + +                        sprintf(db_path, "%s/.glusterfs/%s", rv, db_name); +                        if (dict_add_dynstr_with_alloc(bricklist, "brick", +                                                       db_path)) +                                goto out; +                } +        } + +        for (child = xl->children; child; child = child->next) { +                tier_get_bricklist(child->xlator, bricklist); +        } +out: +        return; +} + +int +tier_start (xlator_t *this, gf_defrag_info_t *defrag) +{ +        dict_t       *bricklist_cold = NULL; +        dict_t       *bricklist_hot = NULL; +        dht_conf_t   *conf     = NULL; +        int tick = 0; +        int next_demote = 0; +        int next_promote = 0; +        int freq_promote = 0; +        int freq_demote = 0; +        promotion_args_t promotion_args = { 0 }; +        demotion_args_t demotion_args = { 0 }; +        int ret_promotion = 0; +        int ret_demotion = 0; +        int ret = 0; +        pthread_t promote_thread; +        pthread_t demote_thread; + +        conf   = this->private; + +        bricklist_cold = dict_new(); +        if (!bricklist_cold) +                return -1; + +        bricklist_hot = dict_new(); +        if (!bricklist_hot) +                return -1; + +        tier_get_bricklist (conf->subvolumes[0], bricklist_cold); +        tier_get_bricklist (conf->subvolumes[1], bricklist_hot); + +        freq_promote = defrag->tier_promote_frequency; +        freq_demote  = defrag->tier_demote_frequency; + +        next_promote = defrag->tier_promote_frequency % TIMER_SECS; +        next_demote  = defrag->tier_demote_frequency % TIMER_SECS; + + +        gf_msg (this->name, GF_LOG_INFO, 0, +                DHT_MSG_LOG_TIER_STATUS, "Begin run tier promote %d demote %d", +                next_promote, next_demote); + +        defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + +        while (1) { + +                sleep(1); + +                ret_promotion = -1; +                ret_demotion = -1; + +                if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { +                        ret = 1; +                        gf_msg (this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_LOG_TIER_ERROR, +                                "defrag->defrag_status != " +                                "GF_DEFRAG_STATUS_STARTED"); +                        goto out; +                } + +                tick = (tick + 1) % TIMER_SECS; +                if ((next_demote != tick) && (next_promote != tick)) +                        continue; + +                if (next_demote >= tick) { +                        demotion_args.this = this; +                        demotion_args.brick_list = bricklist_hot; +                        demotion_args.defrag = defrag; +                        demotion_args.freq_time = freq_demote; +                        ret_demotion = pthread_create (&demote_thread, NULL, +                                        &tier_demote, &demotion_args); +                        if (ret_demotion) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        "Failed starting Demotion thread!"); +                        } +                        freq_demote = defrag->tier_demote_frequency; +                        next_demote = (tick + freq_demote) % TIMER_SECS; +                } + +                if (next_promote >= tick) { +                        promotion_args.this = this; +                        promotion_args.brick_list = bricklist_cold; +                        promotion_args.defrag = defrag; +                        promotion_args.freq_time = freq_promote; +                        ret_promotion = pthread_create (&promote_thread, NULL, +                                                &tier_promote, &promotion_args); +                        if (ret_promotion) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        "Failed starting Promotion thread!"); +                        } +                        freq_promote = defrag->tier_promote_frequency; +                        next_promote = (tick + freq_promote) % TIMER_SECS; +                } + +                if (ret_demotion == 0) { +                        pthread_join (demote_thread, NULL); +                        if (demotion_args.return_value) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        "Demotion failed!"); +                        } +                        ret_demotion = demotion_args.return_value; +                } + +                if (ret_promotion == 0) { +                        pthread_join (promote_thread, NULL); +                        if (promotion_args.return_value) { +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_LOG_TIER_ERROR, +                                        "Promotion failed!"); +                        } +                        ret_promotion = promotion_args.return_value; +                } + +                /*Collect previous and current cummulative status */ +                ret = ret | ret_demotion | ret_promotion; + +                /*reseting promotion and demotion arguments for next iteration*/ +                memset (&demotion_args, 0, sizeof(demotion_args_t)); +                memset (&promotion_args, 0, sizeof(promotion_args_t)); + +        } + +        ret = 0; +out: + +        dict_unref(bricklist_cold); +        dict_unref(bricklist_hot); + +        return ret; +} + +int32_t +tier_migration_needed (xlator_t *this) +{ +        gf_defrag_info_t        *defrag = NULL; +        dht_conf_t              *conf   = NULL; +        int                      ret = 0; + +        conf = this->private; + +        GF_VALIDATE_OR_GOTO (this->name, conf, out); +        GF_VALIDATE_OR_GOTO (this->name, conf->defrag, out); + +        defrag = conf->defrag; + +        if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) +                ret = 1; +out: +        return ret; +} + +int32_t +tier_migration_get_dst (xlator_t *this, dht_local_t *local) +{ +        dht_conf_t              *conf   = NULL; +        int32_t                  ret = -1; + +        GF_VALIDATE_OR_GOTO("tier", this, out); +        GF_VALIDATE_OR_GOTO(this->name, this->private, out); + +        conf = this->private; +        if (!conf) +                goto out; + +        if (conf->subvolumes[0] == local->cached_subvol) +                local->rebalance.target_node = +                        conf->subvolumes[1]; +        else +                local->rebalance.target_node = +                        conf->subvolumes[0]; + +        if (local->rebalance.target_node) +                ret = 0; + +out: +        return ret; +} + +xlator_t * +tier_search (xlator_t *this, dht_layout_t *layout, const char *name) +{ +        xlator_t  *subvol = NULL; +        void      *value; +        int        search_first_subvol = 0; + +        GF_VALIDATE_OR_GOTO("tier", this, out); +        GF_VALIDATE_OR_GOTO(this->name, layout, out); +        GF_VALIDATE_OR_GOTO(this->name, name, out); + +        if (!dict_get_ptr (this->options, "rule", &value) && +            !strcmp(layout->list[0].xlator->name, value)) { +                search_first_subvol = 1; +        } + +        if (search_first_subvol) +                subvol = layout->list[0].xlator; +        else +                subvol = layout->list[1].xlator; + +out: +        return subvol; +} + + +dht_methods_t tier_methods = { +        .migration_get_dst_subvol = tier_migration_get_dst, +        .migration_other = tier_start, +        .migration_needed = tier_migration_needed, +        .layout_search   = tier_search, +}; + + +int +tier_init (xlator_t *this) +{ +        int               ret            = -1; +        int               freq           = 0; +        dht_conf_t       *conf           = NULL; +        gf_defrag_info_t *defrag         = NULL; + +        ret = dht_init(this); +        if (ret) { +                gf_msg(this->name, GF_LOG_ERROR, 0, +                       DHT_MSG_LOG_TIER_ERROR, +                       "dht_init failed"); +                goto out; +        } + +        conf = this->private; + +        conf->methods = &tier_methods; + +        if (conf->subvolume_cnt != 2) { +                gf_msg(this->name, GF_LOG_ERROR, 0, +                       DHT_MSG_LOG_TIER_ERROR, +                       "Invalid number of subvolumes %d", conf->subvolume_cnt); +                goto out; +        } + +        /* if instatiated from client side initialization is complete. */ +        if (!conf->defrag) { +                ret = 0; +                goto out; +        } + +        defrag = conf->defrag; + +        GF_OPTION_INIT ("tier-promote-frequency", +                        defrag->tier_promote_frequency, +                        int32, out); + +        ret = dict_get_int32 (this->options, +                              "tier-promote-frequency", &freq); +        if (ret) { +                freq = DEFAULT_PROMOTE_FREQ_SEC; +        } + +        defrag->tier_promote_frequency = freq; + +        GF_OPTION_INIT ("tier-demote-frequency", +                        defrag->tier_demote_frequency, +                        int32, out); + +        ret = dict_get_int32 (this->options, +                              "tier-demote-frequency", &freq); +        if (ret) { +                freq = DEFAULT_DEMOTE_FREQ_SEC; +        } + +        defrag->tier_demote_frequency = freq; + +        GF_OPTION_INIT ("write-freq-threshold", +                        defrag->write_freq_threshold, +                        int32, out); + +        GF_OPTION_INIT ("read-freq-threshold", +                        defrag->read_freq_threshold, +                        int32, out); + +        gf_msg(this->name, GF_LOG_INFO, 0, +               DHT_MSG_LOG_TIER_STATUS, +               "Promote frequency set to %d demote set to %d", +               defrag->tier_promote_frequency, +               defrag->tier_demote_frequency); + +        ret = 0; + +out: +        return ret; +} + + +int +tier_reconfigure (xlator_t *this, dict_t *options) +{ +        dht_conf_t       *conf           = NULL; +        gf_defrag_info_t *defrag         = NULL; + +        conf = this->private; + +        if (conf->defrag) { +                defrag = conf->defrag; +                GF_OPTION_RECONF ("tier-promote-frequency", +                                  defrag->tier_promote_frequency, options, +                                  int32, out); + +                GF_OPTION_RECONF ("tier-demote-frequency", +                                  defrag->tier_demote_frequency, options, +                                  int32, out); + +                GF_OPTION_RECONF ("write-freq-threshold", +                                  defrag->write_freq_threshold, options, +                                  int32, out); + +                GF_OPTION_RECONF ("read-freq-threshold", +                                  defrag->read_freq_threshold, options, +                                  int32, out); +        } + +out: +        return dht_reconfigure (this, options); +} + +class_methods_t class_methods = { +        .init           = tier_init, +        .fini           = dht_fini, +        .reconfigure    = tier_reconfigure, +        .notify         = dht_notify +}; + + +struct xlator_fops fops = { +        .lookup      = dht_lookup, +        .create      = dht_create, +        .mknod       = dht_mknod, + +        .stat        = dht_stat, +        .fstat       = dht_fstat, +        .truncate    = dht_truncate, +        .ftruncate   = dht_ftruncate, +        .access      = dht_access, +        .readlink    = dht_readlink, +        .setxattr    = dht_setxattr, +        .getxattr    = dht_getxattr, +        .removexattr = dht_removexattr, +        .open        = dht_open, +        .readv       = dht_readv, +        .writev      = dht_writev, +        .flush       = dht_flush, +        .fsync       = dht_fsync, +        .statfs      = dht_statfs, +        .lk          = dht_lk, +        .opendir     = dht_opendir, +        .readdir     = dht_readdir, +        .readdirp    = dht_readdirp, +        .fsyncdir    = dht_fsyncdir, +        .symlink     = dht_symlink, +        .unlink      = dht_unlink, +        .link        = dht_link, +        .mkdir       = dht_mkdir, +        .rmdir       = dht_rmdir, +        .rename      = dht_rename, +        .inodelk     = dht_inodelk, +        .finodelk    = dht_finodelk, +        .entrylk     = dht_entrylk, +        .fentrylk    = dht_fentrylk, +        .xattrop     = dht_xattrop, +        .fxattrop    = dht_fxattrop, +        .setattr     = dht_setattr, +}; + + +struct xlator_cbks cbks = { +        .forget     = dht_forget +}; + diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h new file mode 100644 index 00000000000..73266050a5c --- /dev/null +++ b/xlators/cluster/dht/src/tier.h @@ -0,0 +1,71 @@ +/* +  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef _TIER_H_ +#define _TIER_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +/******************************************************************************/ +/* This is from dht-rebalancer.c as we dont have dht-rebalancer.h */ +#include "dht-common.h" +#include "xlator.h" +#include <signal.h> +#include <fnmatch.h> +#include <signal.h> + +#define DEFAULT_PROMOTE_FREQ_SEC 120 +#define DEFAULT_DEMOTE_FREQ_SEC  120 + +/* + * Size of timer wheel. We would not promote or demote lesd + * frequently than this number. + */ +#define TIMER_SECS 3600 + +#include "gfdb_data_store.h" +#include <ctype.h> +#include <sys/xattr.h> +#include <sys/stat.h> + +#define DEMOTION_QFILE "/var/run/gluster/demotequeryfile" +#define PROMOTION_QFILE "/var/run/gluster/promotequeryfile" + +#define GET_QFILE_PATH(is_promotion)\ +        (is_promotion) ? PROMOTION_QFILE : DEMOTION_QFILE + +typedef struct _query_cbk_args { +        xlator_t *this; +        gf_defrag_info_t *defrag; +        FILE *queryFILE; +} query_cbk_args_t; + +int +gf_run_tier(xlator_t *this, gf_defrag_info_t *defrag); + +typedef struct _gfdb_brick_dict_info { +        gfdb_time_t           *time_stamp; +        gf_boolean_t            _gfdb_promote; +        query_cbk_args_t       *_query_cbk_args; +} _gfdb_brick_dict_info_t; + +typedef struct _dm_thread_args { +        xlator_t                *this; +        gf_defrag_info_t        *defrag; +        dict_t                  *brick_list; +        int                     freq_time; +        int                     return_value; +} promotion_args_t, demotion_args_t; + +#endif  | 
