diff options
| author | Diogenes Nunez <dnunez@redhat.com> | 2016-07-27 11:09:47 -0400 | 
|---|---|---|
| committer | Dan Lambright <dlambrig@redhat.com> | 2016-09-04 18:37:57 -0700 | 
| commit | 261c035c7d0cd1639cc8bd0ead82c30efcc0e93f (patch) | |
| tree | af3a2e498023e7ad8af417312b83ce2f969ef738 | |
| parent | 6459fc812219551291e4be426ed8ecf2c90813a4 (diff) | |
cluster/tier: Adding compaction option for metadata databases
Problem: As metadata in the database fills up, querying the database
take a long time. As a result, tier migration slows down.  To
counteract this, we added a way to enable the compaction methods of
the underlying database. The goal is to reduce the size of the
underlying file by eliminating database fragmentation.
NOTE: There is currently a bug where sometimes a brick will
attempt to activate compaction. This happens even compaction is already
turned on.
The cause is narrowed down to the compact_mode_switch flipping its value.
Changes: libglusterfs/src/gfdb - Added a gfdb function to compact the
underlying database, compact_db() This is a no-op if the database has
no such option.
- Added a compaction function for SQLite3 that does the following
1) Changes the auto_vacuum pragma of the database
2) Compacts the database according to the type of compaction requested
- Compaction type can be changed by changing the macro
  GF_SQL_COMPACT_DEF to one of the 4 compaction types in
  gfdb_sqlite3.h
  It is currently set to GF_SQL_COMPACT_INCR, or incremental
  vacuuming.
xlators/cluster/dht/src - Added the following command-line option to
enable SQLite3 compaction.
gluster volume set <vol-name> tier-compact <off|on>
- Added the following command-line option to change the frequency the
  hot and cold tier are ordered to compact.
gluster volume set <vol-name> tier-hot-compact-frequency <int>
gluster volume set <vol-name> tier-cold-compact-frequency <int>
- tier daemon periodically sends the (new)
  GFDB_IPC_CTR_SET_COMPACT_PRAGMA IPC to the CTR xlator. The IPC
  triggers compaction of the database.
  The inputs are both gf_boolean_t.
  IPC Input:
  compact_active: Is compaction currently on for the db.
  compact_mode_switched: Did we flip the compaction switch recently?
  IPC Output:
  0 if the compaction succeeds.
  Non-zero otherwise.
xlators/features/changetimerecorder/src/ - When the CTR gets the
compaction IPC, it launches a thread that will perform the
compaction. The IPC ends after the thread is launched. To avoid extra
allocations, the parameters are passed using static variables.
Change-Id: I5e1433becb9eeff2afe8dcb4a5798977bf5ba0dd
Signed-off-by: Diogenes Nunez <dnunez@redhat.com>
Reviewed-on: http://review.gluster.org/15031
Reviewed-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Tested-by: Dan Lambright <dlambrig@redhat.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store.c | 40 | ||||
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store.h | 19 | ||||
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store_types.h | 50 | ||||
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_sqlite3.c | 187 | ||||
| -rw-r--r-- | libglusterfs/src/gfdb/gfdb_sqlite3.h | 21 | ||||
| -rw-r--r-- | libglusterfs/src/libglusterfs-messages.h | 23 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 31 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 26 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier.c | 554 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/tier.h | 6 | ||||
| -rw-r--r-- | xlators/features/changetimerecorder/src/changetimerecorder.c | 146 | ||||
| -rw-r--r-- | xlators/features/changetimerecorder/src/ctr-helper.h | 4 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-messages.h | 1 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 59 | 
14 files changed, 1057 insertions, 110 deletions
diff --git a/libglusterfs/src/gfdb/gfdb_data_store.c b/libglusterfs/src/gfdb/gfdb_data_store.c index 9c042f9e82e..cb567503fa3 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store.c +++ b/libglusterfs/src/gfdb/gfdb_data_store.c @@ -433,6 +433,43 @@ delete_record (gfdb_conn_node_t *_conn_node,          return ret;  } +/*Libgfdb API Function: Compact the database. + * + * Arguments: + *      _conn_node                      :  GFDB Connection node + *      _compact_active                 :  Is compaction currently on? + *      _compact_mode_switched          :  Was the compaction switch flipped? + * Returns : if successful return 0 or + *          -ve value in case of failure*/ +int +compact_db (gfdb_conn_node_t *_conn_node, gf_boolean_t _compact_active, +            gf_boolean_t _compact_mode_switched) +{ +        int ret                                 = 0; +        gfdb_db_operations_t *db_operations_t   = NULL; +        void *gf_db_connection                  = NULL; + +        CHECK_CONN_NODE(_conn_node); + +        db_operations_t = &_conn_node->gfdb_connection.gfdb_db_operations; +        gf_db_connection = _conn_node->gfdb_connection.gf_db_connection; + +        if (db_operations_t->compact_db_op) { + +                ret = db_operations_t->compact_db_op (gf_db_connection, +                                                      _compact_active, +                                                      _compact_mode_switched); +                if (ret) { +                        gf_msg (GFDB_DATA_STORE, GF_LOG_ERROR, 0, +                                LG_MSG_COMPACT_FAILED, "Compaction operation " +                                "failed"); +                } + +        } + +        return ret; +} + @@ -835,5 +872,8 @@ void get_gfdb_methods (gfdb_methods_t *methods)          /* Link info related functions */          methods->gfdb_link_info_new = gfdb_link_info_new;          methods->gfdb_link_info_free = gfdb_link_info_free; + +        /* Compaction related functions */ +        methods->compact_db = compact_db;  } diff --git a/libglusterfs/src/gfdb/gfdb_data_store.h b/libglusterfs/src/gfdb/gfdb_data_store.h index eacb8527034..0aac4611153 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store.h +++ b/libglusterfs/src/gfdb/gfdb_data_store.h @@ -31,7 +31,7 @@  #define GFDB_IPC_CTR_CLEAR_OPS "gfdb.ipc-ctr-clear-op"  #define GFDB_IPC_CTR_GET_DB_PARAM_OPS "gfdb.ipc-ctr-get-db-parm"  #define GFDB_IPC_CTR_GET_DB_VERSION_OPS "gfdb.ipc-ctr-get-db-version" - +#define GFDB_IPC_CTR_SET_COMPACT_PRAGMA "gfdb.ipc-ctr-set-compact-pragma"  /*   * CTR IPC INPUT/OUTPUT   * @@ -348,6 +348,21 @@ typedef int (*set_db_params_t)(gfdb_conn_node_t *db_conn,                                       char *param_key,                                       char *param_value); +/*Libgfdb API Function: Compact the database. + * + * Arguments: + *      _conn_node                      :  GFDB Connection node + *      _compact_active                 :  Is compaction currently on? + *      _compact_mode_switched          :  Was the compaction switch flipped? + * Returns : if successful return 0 or + *          -ve value in case of failure*/ +int +compact_db (gfdb_conn_node_t *_conn_node, gf_boolean_t _compact_active, +            gf_boolean_t _compact_mode_switched); + +typedef int (*compact_db_t)(gfdb_conn_node_t *db_conn, +                            gf_boolean_t compact_active, +                            gf_boolean_t compact_mode_switched);  typedef struct gfdb_methods_s { @@ -377,6 +392,8 @@ typedef struct gfdb_methods_s {          gfdb_link_info_new_t            gfdb_link_info_new;          gfdb_link_info_free_t           gfdb_link_info_free; +        /* Compaction related functions */ +        compact_db_t                    compact_db;  } gfdb_methods_t;  void get_gfdb_methods (gfdb_methods_t *methods); diff --git a/libglusterfs/src/gfdb/gfdb_data_store_types.h b/libglusterfs/src/gfdb/gfdb_data_store_types.h index 1acbdf2f99f..d0c96370eb8 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store_types.h +++ b/libglusterfs/src/gfdb/gfdb_data_store_types.h @@ -40,7 +40,8 @@ typedef enum gf_db_operation {          GFDB_W_DELETE_DB_OP,          GFDB_UW_DELETE_DB_OP,          GFDB_WFC_UPDATE_DB_OP, -        GFDB_RFC_UPDATE_DB_OP +        GFDB_RFC_UPDATE_DB_OP, +        GFDB_DB_COMPACT_DB_OP /* Added for VACUUM/manual compaction support */  } gf_db_operation_t; @@ -81,19 +82,12 @@ gfdb_time_2_usec(gfdb_time_t *gfdb_time)          return ((uint64_t) gfdb_time->tv_sec * GFDB_MICROSEC) + gfdb_time->tv_usec;  } - - - -  /******************************************************************************   *   *              Insert/Update Record related data structures/functions   *   * ****************************************************************************/ - - -  /*Indicated a generic synchronous write to the db   * This may or may not be implemented*/  typedef enum gfdb_sync_type { @@ -123,11 +117,6 @@ out:          return ret;  } - - - - -  /*Indicated different types of db*/  typedef enum gfdb_db_type {          GFDB_INVALID_DB = -1, @@ -165,12 +154,6 @@ out:          return ret;  } - - - - - -  /*Tells the path of the fop*/  typedef enum gfdb_fop_path {          GFDB_FOP_INVALID = -1, @@ -206,12 +189,6 @@ isunwindpath(gfdb_fop_path_t gfdb_fop_path)          return (gfdb_fop_path >= GFDB_FOP_UNWIND) ? _gf_true : _gf_false;  } - - - - - -  /*Tell what type of fop it was   * Like whether a dentry fop or a inode fop   * Read fop or a write fop etc*/ @@ -258,12 +235,6 @@ isdentrycreatefop(gfdb_fop_type_t fop_type)                          _gf_true : _gf_false;  } - - - - - -  /*The structure that is used to send insert/update the databases   * using insert_db api*/  typedef struct gfdb_db_record { @@ -374,6 +345,20 @@ typedef int +/*Used to compact the database + * Arguments: + *      db_conn                        :  GFDB Connection node + *      compact_active                 :  Is compaction currently on? + *      compact_mode_switched          :  Was the compaction switch flipped? + * Returns : if successful return 0 or + *          -ve value in case of failure*/ +typedef int +(*gfdb_compact_db_t)(void *db_conn, gf_boolean_t compact_active, +                     gf_boolean_t compact_mode_switched); + + + +  /* Query all the records from the database   * Arguments:   *      db_conn        : plugin specific data base connection @@ -502,6 +487,7 @@ typedef struct gfdb_db_operations {          gfdb_fini_db_t                        fini_db_op;          gfdb_insert_record_t                  insert_record_op;          gfdb_delete_record_t                  delete_record_op; +        gfdb_compact_db_t                     compact_db_op;          gfdb_find_all_t                       find_all_op;          gfdb_find_unchanged_for_time_t        find_unchanged_for_time_op;          gfdb_find_recently_changed_files_t    find_recently_changed_files_op; @@ -598,5 +584,3 @@ typedef struct gfdb_connection {  #endif - - diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.c b/libglusterfs/src/gfdb/gfdb_sqlite3.c index 04781be562a..094028361c5 100644 --- a/libglusterfs/src/gfdb/gfdb_sqlite3.c +++ b/libglusterfs/src/gfdb/gfdb_sqlite3.c @@ -239,6 +239,7 @@ gf_sqlite3_fill_db_operations(gfdb_db_operations_t  *gfdb_db_ops)          gfdb_db_ops->insert_record_op = gf_sqlite3_insert;          gfdb_db_ops->delete_record_op = gf_sqlite3_delete; +        gfdb_db_ops->compact_db_op = gf_sqlite3_vacuum;          gfdb_db_ops->find_all_op = gf_sqlite3_find_all;          gfdb_db_ops->find_unchanged_for_time_op = @@ -1327,10 +1328,14 @@ gf_sqlite3_pragma (void *db_conn, char *pragma_key, char **pragma_value)                  goto out;          } -        ret = gf_asprintf (pragma_value, "%s", sqlite3_column_text (pre_stmt, 0)); -        if (ret <= 0) { -                gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, LG_MSG_QUERY_FAILED, -                        "Failed to get %s from db", pragma_key); +        if (pragma_value) { +                ret = gf_asprintf (pragma_value, "%s", +                                   sqlite3_column_text (pre_stmt, 0)); +                if (ret <= 0) { +                        gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                LG_MSG_QUERY_FAILED, "Failed to get %s from db", +                                pragma_key); +                }          }          ret = 0; @@ -1382,3 +1387,177 @@ out:          return ret;  } + +/* Function to vacuum of sqlite db + * Input: + * void *db_conn                      : Sqlite connection + * gf_boolean_t compact_active        : Is compaction on? + * gf_boolean_t compact_mode_switched : Did we just flip the compaction swtich? + * Return: + *      On success return 0 + *      On failure return -1 + * */ +int +gf_sqlite3_vacuum (void *db_conn, gf_boolean_t compact_active, +                   gf_boolean_t compact_mode_switched) +{ +        int ret = -1; +        gf_sql_connection_t *sql_conn           =       db_conn; +        char *sqlstring = NULL; +        char *sql_strerror = NULL; +        gf_boolean_t changing_pragma = _gf_true; + +        CHECK_SQL_CONN (sql_conn, out); + +        if (GF_SQL_COMPACT_DEF == GF_SQL_COMPACT_NONE) { +                gf_msg (GFDB_STR_SQLITE3, GF_LOG_INFO, 0, +                        LG_MSG_COMPACT_STATUS, +                        "VACUUM type is off: no VACUUM to do"); +                goto out; +        } + +        if (compact_mode_switched) { +                if (compact_active) { /* Then it was OFF before. +                                        So turn everything on */ +                        ret = 0; +                        switch (GF_SQL_COMPACT_DEF) { +                        case GF_SQL_COMPACT_FULL: +                                ret = gf_sqlite3_set_pragma (db_conn, +                                                             "auto_vacuum", +                                                             GF_SQL_AV_FULL); +                                break; +                        case GF_SQL_COMPACT_INCR: +                                ret = gf_sqlite3_set_pragma (db_conn, +                                                             "auto_vacuum", +                                                             GF_SQL_AV_INCR); +                                break; +                        case GF_SQL_COMPACT_MANUAL: +                                changing_pragma = _gf_false; +                        default: +                                ret = -1; +                                gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                        LG_MSG_COMPACT_FAILED, +                                        "VACUUM type undefined"); +                                goto out; +                                break; +                        } + +                } else { /* Then it was ON before, so turn it all off */ +                        if (GF_SQL_COMPACT_DEF == GF_SQL_COMPACT_FULL || +                           GF_SQL_COMPACT_DEF == GF_SQL_COMPACT_INCR) { +                                ret = gf_sqlite3_set_pragma (db_conn, +                                                             "auto_vacuum", +                                                             GF_SQL_AV_NONE); +                        } else { +                                changing_pragma = _gf_false; +                        } +                } + +                if (ret) { +                        gf_msg (GFDB_STR_SQLITE3, GF_LOG_TRACE, 0, +                                LG_MSG_PREPARE_FAILED, +                                "Failed to set the pragma"); +                        goto out; +                } + +                gf_msg (GFDB_STR_SQLITE3, GF_LOG_INFO, 0, +                        LG_MSG_COMPACT_STATUS, "Turning compaction %i", +                        GF_SQL_COMPACT_DEF); + +                /* If we move from an auto_vacuum scheme to off, */ +                /* or vice-versa, we must VACUUM to save the change. */ +                /* In the case of a manual VACUUM scheme, we might as well */ +                /* run a manual VACUUM now if we */ +                if (changing_pragma || compact_active) { +                        ret = gf_asprintf (&sqlstring, "VACUUM;"); +                        if (ret <= 0) { +                                gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                        LG_MSG_PREPARE_FAILED, +                                        "Failed allocating memory"); +                                goto out; +                        } +                        gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, +                               LG_MSG_COMPACT_STATUS, "Sealed with a VACUUM"); +                } +        } else { /* We are active, so it's time to VACUUM */ +                if (!compact_active) { /* Did we somehow enter an inconsistent +                                          state? */ +                        ret = -1; +                        gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                LG_MSG_PREPARE_FAILED, +                                "Tried to VACUUM when compaction inactive"); +                        goto out; +                } + +                gf_msg(GFDB_STR_SQLITE3, GF_LOG_TRACE, 0, +                       LG_MSG_COMPACT_STATUS, +                       "Doing regular vacuum of type %i", GF_SQL_COMPACT_DEF); + +                switch (GF_SQL_COMPACT_DEF) { +                case GF_SQL_COMPACT_INCR: /* INCR auto_vacuum */ +                        ret = gf_asprintf(&sqlstring, +                                          "PRAGMA incremental_vacuum;"); +                        if (ret <= 0) { +                                gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                        LG_MSG_PREPARE_FAILED, +                                        "Failed allocating memory"); +                                goto out; +                        } +                        gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, +                               LG_MSG_COMPACT_STATUS, +                               "Will commence an incremental VACUUM"); +                        break; +                /* (MANUAL) Invoke the VACUUM command */ +                case GF_SQL_COMPACT_MANUAL: +                        ret = gf_asprintf(&sqlstring, "VACUUM;"); +                        if (ret <= 0) { +                                gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                        LG_MSG_PREPARE_FAILED, +                                        "Failed allocating memory"); +                                goto out; +                        } +                        gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, +                               LG_MSG_COMPACT_STATUS, +                               "Will commence a VACUUM"); +                        break; +                /* (FULL) The database does the compaction itself. */ +                /* We cannot do anything else, so we can leave */ +                /* without sending anything to the database */ +                case GF_SQL_COMPACT_FULL: +                        ret = 0; +                        goto success; +                /* Any other state must be an error. Note that OFF */ +                /* cannot hit this statement since we immediately leave */ +                /* in that case */ +                default: +                        ret = -1; +                        gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                                LG_MSG_COMPACT_FAILED, +                                "VACUUM type undefined"); +                        goto out; +                        break; +                } +        } + +        gf_msg(GFDB_STR_SQLITE3, GF_LOG_TRACE, 0, LG_MSG_COMPACT_STATUS, +               "SQLString == %s", sqlstring); + +        ret = sqlite3_exec(sql_conn->sqlite3_db_conn, sqlstring, NULL, NULL, +                           &sql_strerror); + +        if (ret != SQLITE_OK) { +                gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, +                        LG_MSG_GET_RECORD_FAILED, "Failed to vacuum " +                        "the db : %s", sqlite3_errmsg (db_conn)); +                ret = -1; +                goto out; +        } +success: +        gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, LG_MSG_COMPACT_STATUS, +               compact_mode_switched ? "Successfully changed VACUUM on/off" +               : "DB successfully VACUUM"); +out: +        GF_FREE(sqlstring); + +        return ret; +} diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.h b/libglusterfs/src/gfdb/gfdb_sqlite3.h index 9d0d996a322..4d70a60e431 100644 --- a/libglusterfs/src/gfdb/gfdb_sqlite3.h +++ b/libglusterfs/src/gfdb/gfdb_sqlite3.h @@ -73,8 +73,7 @@ do {\  #define GF_SQL_AV_NONE  "none"  #define GF_SQL_AV_FULL  "full" -#define GF_SQL_AV_INCR  "incr" - +#define GF_SQL_AV_INCR  "incremental"  #define GF_SQL_SYNC_OFF         "off"  #define GF_SQL_SYNC_NORMAL      "normal" @@ -87,7 +86,12 @@ do {\  #define GF_SQL_JM_WAL           "wal"  #define GF_SQL_JM_OFF           "off" +#define GF_SQL_COMPACT_NONE   0 +#define GF_SQL_COMPACT_FULL   1 +#define GF_SQL_COMPACT_INCR   2 +#define GF_SQL_COMPACT_MANUAL 3 +#define GF_SQL_COMPACT_DEF    GF_SQL_COMPACT_INCR  typedef enum gf_sql_auto_vacuum {          gf_sql_av_none = 0,          gf_sql_av_full, @@ -319,7 +323,18 @@ int gf_sqlite3_pragma (void *db_conn, char *pragma_key, char **pragma_value);  int  gf_sqlite3_set_pragma (void *db_conn, char *pragma_key, char *pragma_value); - +/* Function to vacuum of sqlite db + * Input: + * void *db_conn                      : Sqlite connection + * gf_boolean_t compact_active        : Is compaction on? + * gf_boolean_t compact_mode_switched : Did we just flip the compaction swtich? + * Return: + *      On success return 0 + *      On failure return -1 + * */ +int +gf_sqlite3_vacuum (void *db_conn, gf_boolean_t compact_active, +                   gf_boolean_t compact_mode_switched);  void gf_sqlite3_fill_db_operations (gfdb_db_operations_t  *gfdb_db_ops); diff --git a/libglusterfs/src/libglusterfs-messages.h b/libglusterfs/src/libglusterfs-messages.h index d2ad44e470e..29196929eb3 100644 --- a/libglusterfs/src/libglusterfs-messages.h +++ b/libglusterfs/src/libglusterfs-messages.h @@ -36,7 +36,9 @@   */  #define GLFS_LG_BASE            GLFS_MSGID_COMP_LIBGLUSTERFS -#define GLFS_LG_NUM_MESSAGES    207 + +#define GLFS_LG_NUM_MESSAGES    209 +  #define GLFS_LG_MSGID_END       (GLFS_LG_BASE + GLFS_LG_NUM_MESSAGES + 1)  /* Messaged with message IDs */  #define glfs_msg_start_lg GLFS_LG_BASE, "Invalid: Start of messages" @@ -1762,6 +1764,7 @@   * @recommendedaction   *   */ +  #define LG_MSG_INVALID_INODE_LIST                         (GLFS_LG_BASE + 207)  /*! @@ -1770,6 +1773,24 @@   * @recommendedaction   *   */ + +#define LG_MSG_COMPACT_FAILED                            (GLFS_LG_BASE + 208) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + * + */ + +#define LG_MSG_COMPACT_STATUS                            (GLFS_LG_BASE + 209) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + * + */  /*------------*/  #define glfs_msg_end_lg GLFS_LG_MSGID_END, "Invalid: End of messages" diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 3717a68273c..da1bcb6a4a1 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -310,7 +310,7 @@ typedef struct dht_local dht_local_t;  /* du - disk-usage */  struct dht_du {          double   avail_percent; -	double   avail_inodes; +        double   avail_inodes;          uint64_t avail_space;          uint32_t log;          uint32_t chunks; @@ -325,10 +325,10 @@ enum gf_defrag_type {          GF_DEFRAG_CMD_START_FORCE = 1 + 4,          GF_DEFRAG_CMD_START_TIER = 1 + 5,          GF_DEFRAG_CMD_STATUS_TIER = 1 + 6, -	GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, -	GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, -	GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, -	GF_DEFRAG_CMD_RESUME_TIER = 1 + 10, +        GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, +        GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, +        GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, +        GF_DEFRAG_CMD_RESUME_TIER = 1 + 10,  };  typedef enum gf_defrag_type gf_defrag_type; @@ -398,9 +398,26 @@ typedef struct gf_tier_conf {          uint64_t                     max_migrate_bytes;          int                          max_migrate_files;          tier_mode_t                  mode; +        /* These flags are only used for tier-compact */ +        gf_boolean_t                 compact_active; +        /* These 3 flags are set to true when the client changes the */ +        /* compaction mode on the command line. */ +        /* When they are set, the daemon will trigger compaction as */ +        /* soon as possible to activate or deactivate compaction. */ +        /* If in the middle of a compaction, then the switches take */ +        /* effect on the next compaction, not the current one. */ +        /* If the user switches it off, we want to avoid needless */ +        /* compactions. */ +        /* If the user switches it on, they want to compact as soon */ +        /* as possible. */ +        gf_boolean_t                 compact_mode_switched; +        gf_boolean_t                 compact_mode_switched_hot; +        gf_boolean_t                 compact_mode_switched_cold;          int                          tier_max_promote_size;          int                          tier_promote_frequency;          int                          tier_demote_frequency; +        int                          tier_compact_hot_frequency; +        int                          tier_compact_cold_frequency;          uint64_t                     st_last_promoted_size;          uint64_t                     st_last_demoted_size;          tier_pause_state_t           pause_state; @@ -1023,9 +1040,9 @@ int32_t dht_setattr (call_frame_t  *frame, xlator_t *this, loc_t *loc,  int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,                        struct iatt  *stbuf, int32_t valid, dict_t *xdata);  int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, -		      int32_t mode, off_t offset, size_t len, dict_t *xdata); +                      int32_t mode, off_t offset, size_t len, dict_t *xdata);  int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, -		    off_t offset, size_t len, dict_t *xdata); +                    off_t offset, size_t len, dict_t *xdata);  int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,                      off_t offset, off_t len, dict_t *xdata);  int32_t dht_ipc (call_frame_t *frame, xlator_t *this, int32_t op, diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 873ced53eec..f410f71b5a6 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -148,7 +148,7 @@ dht_priv_dump (xlator_t *this)          gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);          gf_proc_dump_write("gen", "%d", conf->gen);          gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); -	gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); +        gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);          gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);          gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);          gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); @@ -433,14 +433,14 @@ dht_reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("lookup-optimize", conf->lookup_optimize, options,                            bool, out); -	GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, +        GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,                            percent_or_size, out);          /* option can be any one of percent or bytes */          conf->disk_unit = 0;          if (conf->min_free_disk < 100.0)                  conf->disk_unit = 'p'; -	GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, +        GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,                            percent, out);          GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, @@ -711,8 +711,8 @@ dht_init (xlator_t *this)          GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); -	GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, -			err); +        GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, +                        err);          GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,                          err); @@ -901,7 +901,7 @@ struct volume_options options[] = {            "process starts balancing out the cluster, and logs will appear "            "in log files",          }, -	{ .key  = {"min-free-inodes"}, +        { .key  = {"min-free-inodes"},            .type = GF_OPTION_TYPE_PERCENT,            .default_value = "5%",            .description = "after system has only N% of inodes, warnings " @@ -1038,6 +1038,20 @@ struct volume_options options[] = {            .type = GF_OPTION_TYPE_STR,            .default_value = "test",          }, +        { .key         = {"tier-compact"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +        }, +        { .key         = {"tier-hot-compact-frequency"}, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "604800", +          .description = "Frequency to compact DBs on hot tier in system" +        }, +        { .key         = {"tier-cold-compact-frequency"}, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "604800", +          .description = "Frequency to compact DBs on cold tier in system" +        },          { .key         = {"tier-max-mb"},            .type = GF_OPTION_TYPE_INT,            .default_value = "4000", diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index 0d53a62d327..7e5e1004b84 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -1240,15 +1240,15 @@ tier_process_ctr_query (tier_brick_list_t *local_brick, void *args)                  gfdb_brick_info->time_stamp,                  sizeof (gfdb_time_t)); -        SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, -                             GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_QUERY_OPS, -                             ret, out); +        SET_DB_PARAM_TO_DICT (this->name, ctr_ipc_in_dict, +                              GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_QUERY_OPS, +                              ret, out); -        SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, -                             GFDB_IPC_CTR_GET_QFILE_PATH, -                             local_brick->qfile_path, -                             ret, out); +        SET_DB_PARAM_TO_DICT (this->name, ctr_ipc_in_dict, +                              GFDB_IPC_CTR_GET_QFILE_PATH, +                              local_brick->qfile_path, +                              ret, out);          ret = dict_set_bin (ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS,                                  ipc_ctr_params, sizeof (*ipc_ctr_params)); @@ -1360,7 +1360,7 @@ tier_process_brick (tier_brick_list_t *local_brick, void *args) {                          gf_msg ("tier", GF_LOG_ERROR, 0,                                  LG_MSG_SET_PARAM_FAILED, "Failed to set %s "                                  "to params dictionary", -                                GFDB_IPC_CTR_GET_DB_KEY);\ +                                GFDB_IPC_CTR_GET_DB_KEY);                          goto out;                  } @@ -1442,11 +1442,12 @@ tier_build_migration_qfile (migration_args_t *args,          }          time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; -        /* The migration daemon may run a varrying numberof usec after the sleep */ -        /* call triggers. A file may be registered in CTR some number of usec X */ -        /* after the daemon started and missed in the subsequent cycle if the */ -        /* daemon starts Y usec after the period in seconds where Y>X. Normalize */ -        /* away this problem by always setting usec to 0. */ +        /* The migration daemon may run a varying numberof usec after the */ +        /* sleep call triggers. A file may be registered in CTR some number */ +        /* of usec X after the daemon started and missed in the subsequent */ +        /* cycle if the daemon starts Y usec after the period in seconds */ +        /* where Y>X. Normalize away this problem by always setting usec */ +        /* to 0. */          time_in_past.tv_usec = 0;          gfdb_brick_info.time_stamp = &time_in_past; @@ -1649,6 +1650,265 @@ out:          return ret;  } + +/* + * Command the CTR on a brick to compact the local database using an IPC + */ +static int +tier_process_self_compact (tier_brick_list_t *local_brick, void *args) +{ +        int ret                                         = -1; +        char *db_path                                   = NULL; +        query_cbk_args_t *query_cbk_args                = NULL; +        xlator_t *this                                  = NULL; +        gfdb_conn_node_t *conn_node                     = NULL; +        dict_t *params_dict                             = NULL; +        dict_t *ctr_ipc_dict                            = NULL; +        gfdb_brick_info_t *gfdb_brick_info              = args; +        int is_changing                                 = -1; + +        /*Init of all the essentials*/ +        GF_VALIDATE_OR_GOTO ("tier", gfdb_brick_info , out); +        query_cbk_args = gfdb_brick_info->_query_cbk_args; + +        GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out); +        this = query_cbk_args->this; + +        GF_VALIDATE_OR_GOTO (this->name, +                             gfdb_brick_info->_query_cbk_args, out); + +        GF_VALIDATE_OR_GOTO (this->name, local_brick, out); + +        GF_VALIDATE_OR_GOTO (this->name, local_brick->xlator, out); + +        GF_VALIDATE_OR_GOTO (this->name, local_brick->brick_db_path, out); + +        db_path = local_brick->brick_db_path; + +        /*Preparing DB parameters before init_db i.e getting db connection*/ +        params_dict = dict_new (); +        if (!params_dict) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "DB Params cannot initialized"); +                goto out; +        } +        SET_DB_PARAM_TO_DICT (this->name, params_dict, +                             (char *) gfdb_methods.get_db_path_key(), db_path, +                             ret, out); + +        /*Get the db connection*/ +        conn_node = gfdb_methods.init_db ((void *)params_dict, +                                          dht_tier_db_type); +        if (!conn_node) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "FATAL: Failed initializing db operations"); +                         goto out; +        } + +        ret = 0; + +        /*Preparing ctr_ipc_dict*/ +        ctr_ipc_dict = dict_new (); +        if (!ctr_ipc_dict) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "ctr_ipc_dict cannot initialized"); +                goto out; +        } + +        ret = dict_set_int32 (ctr_ipc_dict, "compact_active", +                              query_cbk_args->defrag-> +                              tier_conf.compact_active); + +        if (ret) { +                gf_msg ("tier", GF_LOG_ERROR, 0, +                        LG_MSG_SET_PARAM_FAILED, "Failed to set %s " +                        "to params dictionary", +                        "compact_active"); +                goto out; +        } + +        ret = dict_set_int32 (ctr_ipc_dict, "compact_mode_switched", +                              query_cbk_args->defrag-> +                              tier_conf.compact_mode_switched); + +        if (ret) { +                gf_msg ("tier", GF_LOG_ERROR, 0, +                        LG_MSG_SET_PARAM_FAILED, "Failed to set %s " +                        "to params dictionary", +                        "compact_mode_switched"); +                goto out; +        } + +        SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, +                             GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_SET_COMPACT_PRAGMA, +                             ret, out); + +        gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, +                "Starting Compaction IPC"); + +        ret = syncop_ipc (local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict, +                          NULL); + +        gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, +                "Ending Compaction IPC"); + +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, "Failed compaction " +                        "on db %s error %d", local_brick->brick_db_path, ret); +                goto out; +        } + +        gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, +                "SUCCESS: %s Compaction", local_brick->brick_name); + +        ret = 0; +out: +        if (params_dict) { +                dict_unref (params_dict); +                params_dict = NULL; +        } + +        if (ctr_ipc_dict) { +                dict_unref (ctr_ipc_dict); +                ctr_ipc_dict = NULL; +        } + +        gfdb_methods.fini_db (conn_node); + +        return ret; +} + +/* + * This is the call back function for each brick from hot/cold bricklist. + * It determines the database type on each brick and calls the corresponding + * function to prepare the compaction IPC. + */ +static int +tier_compact_db_brick (tier_brick_list_t *local_brick, void *args) +{ +        int ret = -1; +        char *strval = NULL; + +        GF_VALIDATE_OR_GOTO ("tier", local_brick, out); + +        GF_VALIDATE_OR_GOTO ("tier", local_brick->xlator, out); + +        ret = tier_process_self_compact (local_brick, args); +        if (ret) { +                gf_msg ("tier", GF_LOG_INFO, 0, +                        DHT_MSG_LOG_TIER_STATUS, +                        "Brick %s did not compact", +                        local_brick->brick_name); +                goto out; +        } + +        ret = 0; + +out: + +        return ret; +} + +static int +tier_send_compact (migration_args_t *args, +                   query_cbk_args_t *query_cbk_args) +{ +        gfdb_time_t               current_time; +        gfdb_brick_info_t         gfdb_brick_info; +        gfdb_time_t               time_in_past; +        int                       ret = -1; +        tier_brick_list_t         *local_brick = NULL; + +        time_in_past.tv_sec = args->freq_time; +        time_in_past.tv_usec = 0; + +        ret = gettimeofday (¤t_time, NULL); +        if (ret == -1) { +                gf_msg (args->this->name, GF_LOG_ERROR, errno, +                        DHT_MSG_SYS_CALL_GET_TIME_FAILED, +                        "Failed to get current time"); +                goto out; +        } +        time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; + +        /* The migration daemon may run a varying numberof usec after the sleep +           call triggers. A file may be registered in CTR some number of usec X +           after the daemon started and missed in the subsequent cycle if the +           daemon starts Y usec after the period in seconds where Y>X. Normalize +           away this problem by always setting usec to 0. */ +        time_in_past.tv_usec = 0; + +        gfdb_brick_info.time_stamp = &time_in_past; + +        /* This is meant to say we are always compacting at this point */ +        /* We simply borrow the promotion flag to do this */ +        gfdb_brick_info._gfdb_promote = 1; + +        gfdb_brick_info._query_cbk_args = query_cbk_args; + +        list_for_each_entry (local_brick, args->brick_list, list) { + +                gf_msg (args->this->name, GF_LOG_TRACE, 0, +                        DHT_MSG_LOG_TIER_STATUS, +                        "Start compaction for %s", +                        local_brick->brick_name); + +                ret = tier_compact_db_brick (local_brick, +                                             &gfdb_brick_info); +                if (ret) { +                        gf_msg (args->this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_BRICK_QUERY_FAILED, +                                "Brick %s compaction failed\n", +                                local_brick->brick_db_path); +                } + +                gf_msg (args->this->name, GF_LOG_TRACE, 0, +                        DHT_MSG_LOG_TIER_STATUS, +                        "End compaction for %s", +                        local_brick->brick_name); + +        } +        ret = 0; +out: +        return ret; +} + +static int +tier_compact (void *args) +{ +        int ret = -1; +        query_cbk_args_t query_cbk_args; +        migration_args_t *compaction_args = args; + +        GF_VALIDATE_OR_GOTO ("tier", compaction_args->this, out); +        GF_VALIDATE_OR_GOTO (compaction_args->this->name, +                             compaction_args->brick_list, out); +        GF_VALIDATE_OR_GOTO (compaction_args->this->name, +                             compaction_args->defrag, out); + +        THIS = compaction_args->this; + +        query_cbk_args.this = compaction_args->this; +        query_cbk_args.defrag = compaction_args->defrag; +        query_cbk_args.is_compaction = 1; + +        /* Send the compaction pragma out to all the bricks on the bricklist. */ +        /* tier_get_bricklist ensures all bricks on the list are local to */ +        /* this node. */ +        ret = tier_send_compact (compaction_args, &query_cbk_args); +        if (ret) +                goto out; + +        ret = 0; +out: +        compaction_args->return_value = ret; +        return ret; + } +  static int  tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head)  { @@ -1755,6 +2015,18 @@ tier_get_freq_promote (gf_tier_conf_t *tier_conf)          return tier_conf->tier_promote_frequency;  } +int +tier_get_freq_compact_hot (gf_tier_conf_t *tier_conf) +{ +        return tier_conf->tier_compact_hot_frequency; +} + +int +tier_get_freq_compact_cold (gf_tier_conf_t *tier_conf) +{ +        return tier_conf->tier_compact_cold_frequency; +} +  static int  tier_check_demote (gfdb_time_t  current_time, int freq)  { @@ -1776,8 +2048,21 @@ tier_check_promote (gf_tier_conf_t   *tier_conf,                          _gf_true : _gf_false;  } +static gf_boolean_t +tier_check_compact (gf_tier_conf_t   *tier_conf, +                    gfdb_time_t  current_time, +                    int freq_compact) +{ + +        if (!(tier_conf->compact_active || +              tier_conf->compact_mode_switched)) +                return _gf_false; +        return ((current_time.tv_sec % freq_compact) == 0) ? +                _gf_true : _gf_false; +} +  void  clear_bricklist (struct list_head *brick_list) @@ -1824,6 +2109,72 @@ out:          return;  } +static int +tier_prepare_compact (migration_args_t *args, gfdb_time_t current_time) +{ +        xlator_t *this = NULL; +        dht_conf_t *conf                        = NULL; +        gf_defrag_info_t *defrag                = NULL; +        gf_tier_conf_t *tier_conf               = NULL; +        gf_boolean_t is_hot_tier = _gf_false; +        int freq = 0; +        int ret = -1; +        const char *tier_type = is_hot_tier ? "hot" : "cold"; + +        this = args->this; + +        conf = this->private; + +        defrag = conf->defrag; + +        tier_conf = &defrag->tier_conf; + +        is_hot_tier = args->is_hot_tier; + +        freq = is_hot_tier ? tier_get_freq_compact_hot (tier_conf) : +                tier_get_freq_compact_cold (tier_conf); + +        defrag->tier_conf.compact_mode_switched = is_hot_tier ? +                defrag->tier_conf.compact_mode_switched_hot : +                defrag->tier_conf.compact_mode_switched_cold; + +        gf_msg(this->name, GF_LOG_TRACE, 0, +               DHT_MSG_LOG_TIER_STATUS, +               "Compact mode %i", +               defrag->tier_conf.compact_mode_switched); + +        if (tier_check_compact (tier_conf, current_time, +                                freq)) { +                gf_msg (this->name, GF_LOG_INFO, 0, +                        DHT_MSG_LOG_TIER_STATUS, +                        "Start compaction on %s tier", +                        tier_type); + +                args->freq_time = freq; +                ret = tier_compact (args); +                if (ret) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_LOG_TIER_ERROR, "Compaction failed on " +                                "%s tier", tier_type); +                        goto out; +                } + +                gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, +                        "End compaction on %s tier", tier_type); + +                if (is_hot_tier) { +                        defrag->tier_conf.compact_mode_switched_hot = +                                _gf_false; +                } else { +                        defrag->tier_conf.compact_mode_switched_cold = +                                _gf_false; +                } +        } + +out: +        return ret; +} +  /*   * Main tiering loop. This is called from the promotion and the   * demotion threads spawned in tier_start(). @@ -1846,8 +2197,9 @@ static void          int check_watermark                     = 0;          gf_defrag_info_t *defrag                = NULL;          xlator_t  *this                         = NULL; +        struct list_head *bricklist_temp        = NULL;          migration_args_t *args = in_args; - +        gf_boolean_t compacted = _gf_false;          GF_VALIDATE_OR_GOTO ("tier", args, out);          GF_VALIDATE_OR_GOTO ("tier", args->brick_list, out); @@ -1884,7 +2236,8 @@ static void                  if (xlator != this) {                          gf_msg (this->name, GF_LOG_INFO, 0,                                  DHT_MSG_LOG_TIER_STATUS, -                                "Detected graph switch. Exiting migration daemon."); +                                "Detected graph switch. Exiting migration " +                                "daemon.");                          goto out;                  } @@ -1912,10 +2265,10 @@ static void                          goto out;                  } -                if (gf_defrag_get_pause_state (&defrag->tier_conf) != TIER_RUNNING) +                if (gf_defrag_get_pause_state (&defrag->tier_conf) != +                    TIER_RUNNING)                          continue; -                  /* To have proper synchronization amongst all                   * brick holding nodes, so that promotion and demotions                   * start atomicly w.r.t promotion/demotion frequency @@ -1950,7 +2303,6 @@ static void                  }                  if (args->is_promotion) { -                          freq = tier_get_freq_promote (tier_conf);                          if (tier_check_promote (tier_conf, current_time, freq)) { @@ -1962,21 +2314,22 @@ static void                                                  "Promotion failed");                                  }                          } - +                } else if (args->is_compaction) { +                        tier_prepare_compact (args, current_time);                  } else { -                          freq = tier_get_freq_demote (tier_conf);                          if (tier_check_demote (current_time, freq)) {                                  args->freq_time = freq;                                  ret = tier_demote (args);                                  if (ret) { -                                        gf_msg (this->name, GF_LOG_ERROR, 0, +                                        gf_msg (this->name, +                                                GF_LOG_ERROR, +                                                0,                                                  DHT_MSG_LOG_TIER_ERROR,                                                  "Demotion failed");                                  }                          } -                  }                  /* Check the statfs immediately after the processing threads @@ -1997,11 +2350,15 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)  {          pthread_t promote_thread;          pthread_t demote_thread; +        pthread_t hot_compact_thread; +        pthread_t cold_compact_thread;          int ret = -1;          struct list_head bricklist_hot          = { 0 };          struct list_head bricklist_cold         = { 0 };          migration_args_t promotion_args         = { 0 };          migration_args_t demotion_args          = { 0 }; +        migration_args_t hot_compaction_args    = { 0 }; +        migration_args_t cold_compaction_args   = { 0 };          dht_conf_t *conf                        = NULL;          INIT_LIST_HEAD ((&bricklist_hot)); @@ -2016,6 +2373,7 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)          demotion_args.brick_list = &bricklist_hot;          demotion_args.defrag = defrag;          demotion_args.is_promotion = _gf_false; +        demotion_args.is_compaction = _gf_false;          ret = pthread_create (&demote_thread,                                NULL, &tier_run, @@ -2047,6 +2405,47 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)                  goto waitforspawned;          } +        hot_compaction_args.this = this; +        hot_compaction_args.brick_list = &bricklist_hot; +        hot_compaction_args.defrag = defrag; +        hot_compaction_args.is_promotion = _gf_false; +        hot_compaction_args.is_compaction = _gf_true; +        hot_compaction_args.is_hot_tier = _gf_true; + +        ret = pthread_create (&hot_compact_thread, +                              NULL, &tier_run, +                              &hot_compaction_args); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "Failed to start compaction thread."); +                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; +                goto waitforspawnedpromote; +        } + +        cold_compaction_args.this = this; +        cold_compaction_args.brick_list = &bricklist_cold; +        cold_compaction_args.defrag = defrag; +        cold_compaction_args.is_promotion = _gf_false; +        cold_compaction_args.is_compaction = _gf_true; +        cold_compaction_args.is_hot_tier = _gf_false; + +        ret = pthread_create (&cold_compact_thread, +                              NULL, &tier_run, +                              &cold_compaction_args); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        DHT_MSG_LOG_TIER_ERROR, +                        "Failed to start compaction thread."); +                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; +                goto waitforspawnedhotcompact; +        } +        pthread_join (cold_compact_thread, NULL); + +waitforspawnedhotcompact: +        pthread_join (hot_compact_thread, NULL); + +waitforspawnedpromote:          pthread_join (promote_thread, NULL);  waitforspawned: @@ -2055,7 +2454,6 @@ waitforspawned:  cleanup:          clear_bricklist (&bricklist_cold);          clear_bricklist (&bricklist_hot); -          return ret;  } @@ -2167,8 +2565,8 @@ out:          return ret;  } -static -int tier_validate_mode (char *mode) +static int +tier_validate_mode (char *mode)  {          int ret = -1; @@ -2181,6 +2579,26 @@ int tier_validate_mode (char *mode)          return ret;  } +static gf_boolean_t +tier_validate_compact_mode (char *mode) +{ +        gf_boolean_t ret = _gf_false; + +        gf_msg ("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, +                "tier_validate_compact_mode: mode = %s", mode); + +        if (!strcmp (mode, "on")) { +                ret = _gf_true; +        } else { +                ret = _gf_false; +        } + +        gf_msg ("tier", GF_LOG_ERROR, 0, +                DHT_MSG_LOG_TIER_STATUS, +                "tier_validate_compact_mode: ret = %i", ret); + +        return ret; +}  int  tier_init_methods (xlator_t *this) @@ -2205,8 +2623,6 @@ err:          return ret;  } - -  int  tier_init (xlator_t *this)  { @@ -2291,6 +2707,22 @@ tier_init (xlator_t *this)          defrag->tier_conf.tier_demote_frequency = freq;          ret = dict_get_int32 (this->options, +                              "tier-hot-compact-frequency", &freq); +        if (ret) { +                freq = DEFAULT_HOT_COMPACT_FREQ_SEC; +        } + +        defrag->tier_conf.tier_compact_hot_frequency = freq; + +        ret = dict_get_int32 (this->options, +                              "tier-cold-compact-frequency", &freq); +        if (ret) { +                freq = DEFAULT_COLD_COMPACT_FREQ_SEC; +        } + +        defrag->tier_conf.tier_compact_cold_frequency = freq; + +        ret = dict_get_int32 (this->options,                                "watermark-hi", &freq);          if (ret) {                  freq = DEFAULT_WM_HI; @@ -2339,6 +2771,29 @@ tier_init (xlator_t *this)          defrag->tier_conf.max_migrate_files = freq;          ret = dict_get_str (this->options, +                            "tier-compact", &mode); + +        if (ret) { +                defrag->tier_conf.compact_active = DEFAULT_COMP_MODE; +        } else { +                ret = tier_validate_compact_mode (mode); +                if (ret < 0) { +                        gf_msg(this->name, GF_LOG_ERROR, 0, +                               DHT_MSG_LOG_TIER_ERROR, +                               "tier_init failed - invalid compaction mode"); +                        goto out; +                } + +                /* If compaction is now active, we need to inform the bricks on +                   the hot and cold tier of this. See dht-common.h for more. */ +                defrag->tier_conf.compact_active = ret; +                if (ret) { +                        defrag->tier_conf.compact_mode_switched_hot = _gf_true; +                        defrag->tier_conf.compact_mode_switched_cold = _gf_true; +                } +        } + +        ret = dict_get_str (this->options,                              "tier-mode", &mode);          if (ret) {                  defrag->tier_conf.mode = DEFAULT_TIER_MODE; @@ -2361,7 +2816,8 @@ tier_init (xlator_t *this)                                "tier-pause", &paused);          if (paused && strcmp (paused, "on") == 0) -                gf_defrag_set_pause_state (&defrag->tier_conf, TIER_REQUEST_PAUSE); +                gf_defrag_set_pause_state (&defrag->tier_conf, +                                           TIER_REQUEST_PAUSE);          ret = gf_asprintf(&voldir, "%s/%s",                            DEFAULT_VAR_RUN_DIRECTORY, @@ -2411,7 +2867,6 @@ out:          return ret;  } -  int  tier_cli_pause_done (int op_ret, call_frame_t *sync_frame, void *data)  { @@ -2445,17 +2900,17 @@ exit:          return ret;  } -  int  tier_reconfigure (xlator_t *this, dict_t *options)  { -        dht_conf_t       *conf           = NULL; -        gf_defrag_info_t *defrag         = NULL; -        char             *mode           = NULL; -        int               migrate_mb     = 0; -        gf_boolean_t      req_pause      = _gf_false; -        int               ret            = 0; -        call_frame_t            *frame  = NULL; +        dht_conf_t       *conf            = NULL; +        gf_defrag_info_t *defrag          = NULL; +        char             *mode            = NULL; +        int               migrate_mb      = 0; +        gf_boolean_t      req_pause       = _gf_false; +        int               ret             = 0; +        call_frame_t            *frame    = NULL; +        gf_boolean_t last_compact_setting = _gf_false;          conf = this->private; @@ -2489,6 +2944,28 @@ tier_reconfigure (xlator_t *this, dict_t *options)                                    defrag->tier_conf.watermark_low, options,                                    int32, out); +                last_compact_setting = defrag->tier_conf.compact_active; + +                GF_OPTION_RECONF ("tier-compact", +                                  defrag->tier_conf.compact_active, options, +                                  bool, out); + +                if (last_compact_setting != defrag->tier_conf.compact_active) { +                        defrag->tier_conf.compact_mode_switched_hot = _gf_true; +                        defrag->tier_conf.compact_mode_switched_cold = _gf_true; +                        gf_msg (this->name, GF_LOG_INFO, 0, +                                DHT_MSG_LOG_TIER_STATUS, +                                "compact mode switched"); +                } + +                GF_OPTION_RECONF ("tier-hot-compact-frequency", +                                  defrag->tier_conf.tier_compact_hot_frequency, +                                  options, int32, out); + +                GF_OPTION_RECONF ("tier-cold-compact-frequency", +                                  defrag->tier_conf.tier_compact_cold_frequency, +                                  options, int32, out); +                  GF_OPTION_RECONF ("tier-mode",                                    mode, options,                                    str, out); @@ -2558,7 +3035,6 @@ class_methods_t class_methods = {          .notify         = dht_notify  }; -  struct xlator_fops fops = {          .lookup      = dht_lookup, @@ -2611,9 +3087,7 @@ struct xlator_fops fops = {          .zerofill    = dht_zerofill,  }; -  struct xlator_cbks cbks = {          .release    = dht_release,          .forget     = dht_forget  }; - diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h index 0807608fda2..ffb04173bd5 100644 --- a/xlators/cluster/dht/src/tier.h +++ b/xlators/cluster/dht/src/tier.h @@ -54,6 +54,7 @@ typedef struct _query_cbk_args {          /* This is write */          int                     query_fd;          int                     is_promotion; +        int                     is_compaction;          /* This is for read */          tier_qfile_array_t       *qfile_array;  } query_cbk_args_t; @@ -82,6 +83,8 @@ typedef struct _dm_thread_args {          int                     freq_time;          int                     return_value;          int                     is_promotion; +        int                     is_compaction; +        gf_boolean_t            is_hot_tier;  } migration_args_t;  typedef enum tier_watermark_op_ { @@ -93,12 +96,15 @@ typedef enum tier_watermark_op_ {  #define DEFAULT_PROMOTE_FREQ_SEC       120  #define DEFAULT_DEMOTE_FREQ_SEC        120 +#define DEFAULT_HOT_COMPACT_FREQ_SEC   604800 +#define DEFAULT_COLD_COMPACT_FREQ_SEC  604800  #define DEFAULT_DEMOTE_DEGRADED        10  #define DEFAULT_WRITE_FREQ_SEC         0  #define DEFAULT_READ_FREQ_SEC          0  #define DEFAULT_WM_LOW                 75  #define DEFAULT_WM_HI                  90  #define DEFAULT_TIER_MODE              TIER_MODE_TEST +#define DEFAULT_COMP_MODE              _gf_true  #define DEFAULT_TIER_MAX_MIGRATE_MB    1000  #define DEFAULT_TIER_MAX_MIGRATE_FILES 5000 diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c index 5f3a074acd5..933f496028c 100644 --- a/xlators/features/changetimerecorder/src/changetimerecorder.c +++ b/xlators/features/changetimerecorder/src/changetimerecorder.c @@ -15,6 +15,8 @@  #include "ctr-messages.h"  #include "syscall.h" +#include "changetimerecorder.h" +  /*******************************inode forget***********************************/  int @@ -1789,6 +1791,61 @@ out:          return ret;  } +void * +ctr_compact_thread (void *args) +{ +        int ret = -1; +        void *db_conn = NULL; + +        xlator_t *this = NULL; +        gf_ctr_private_t *priv = NULL; +        gf_boolean_t compact_active = _gf_false; +        gf_boolean_t compact_mode_switched = _gf_false; + +        this = (xlator_t *)args; + +        GF_VALIDATE_OR_GOTO("ctr", this, out); + +        priv = this->private; + +        db_conn = priv->_db_conn; +        compact_active = priv->compact_active; +        compact_mode_switched = priv->compact_mode_switched; + +        gf_msg ("ctr-compact", GF_LOG_INFO, 0, CTR_MSG_SET, +                "Starting compaction"); + +        ret = compact_db(db_conn, compact_active, +                         compact_mode_switched); + +        if (ret) { +                gf_msg ("ctr-compact", GF_LOG_ERROR, 0, CTR_MSG_SET, +                        "Failed to perform the compaction"); +        } + +        ret = pthread_mutex_lock (&priv->compact_lock); + +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                        "Failed to acquire lock"); +                goto out; +        } + +        /* We are done compaction on this brick. Set all flags to false */ +        priv->compact_active = _gf_false; +        priv->compact_mode_switched = _gf_false; + +        ret = pthread_mutex_unlock (&priv->compact_lock); + +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                        "Failed to release lock"); +                goto out; +        } + +out: +        return NULL; +}  int  ctr_ipc_helper (xlator_t *this, dict_t *in_dict, @@ -1802,7 +1859,8 @@ ctr_ipc_helper (xlator_t *this, dict_t *in_dict,          char *db_param = NULL;          char *query_file = NULL;          gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL; - +        int result = 0; +        pthread_t compact_thread;          GF_VALIDATE_OR_GOTO ("ctr", this, out);          GF_VALIDATE_OR_GOTO (this->name, this->private, out); @@ -1888,12 +1946,78 @@ ctr_ipc_helper (xlator_t *this, dict_t *in_dict,                  SET_DB_PARAM_TO_DICT(this->name, out_dict,                                          db_param_key,                                          db_param, ret, error); +        } /* if its an attempt to compact the database */ +        else if (strncmp (ctr_ipc_ops, GFDB_IPC_CTR_SET_COMPACT_PRAGMA, +                          strlen (GFDB_IPC_CTR_SET_COMPACT_PRAGMA)) == 0) { + +                ret = pthread_mutex_lock (&priv->compact_lock); +                if (ret) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                                "Failed to acquire lock for compaction"); +                        goto out; +                } + +                if ((priv->compact_active || priv->compact_mode_switched)) { +                        /* Compaction in progress. LEAVE */ +                        gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                               "Compaction already in progress."); +                        pthread_mutex_unlock (&priv->compact_lock); +                        goto out; +                } +                /* At this point, we should be the only one on the brick */ +                /* compacting */ + +                /* Grab the arguments from the dictionary */ +                ret = dict_get_int32 (in_dict, "compact_active", &result); +                if (ret) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                               "Failed to get compaction type"); +                        goto out; +                } + +                if (result) { +                        priv->compact_active = _gf_true; +                } + +                ret = dict_get_int32 (in_dict, "compact_mode_switched" +                                     , &result); +                if (ret) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                               "Failed to see if compaction switched"); +                        goto out; +                } + +                if (result) { +                        priv->compact_mode_switched = _gf_true; +                        gf_msg ("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET, +                                "Pre-thread: Compact mode switch is true"); +                } else { +                        gf_msg ("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET, +                                "Pre-thread: Compact mode switch is false"); +                } + +                ret = pthread_mutex_unlock (&priv->compact_lock); +                if (ret) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                                "Failed to release lock for compaction"); +                        goto out; +                } + +                ret = pthread_create (&compact_thread, NULL, ctr_compact_thread, +                                      (void *)this); + +                if (ret) { +                        gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, +                                "Failed to spawn compaction thread"); +                        goto out; +                } + +                goto out;          } /* default case */          else {                  goto out;          } -          ret = 0;          goto out;  error: @@ -2079,6 +2203,18 @@ init (xlator_t *this)          priv->ctr_lookupheal_inode_timeout =                                  CTR_DEFAULT_INODE_EXP_PERIOD; +        /* For compaction */ +        priv->compact_active = _gf_false; +        priv->compact_mode_switched = _gf_false; +        ret_db = pthread_mutex_init (&priv->compact_lock, NULL); + +        if (ret_db) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        CTR_MSG_FATAL_ERROR, +                        "FATAL: Failed initializing compaction mutex"); +                goto error; +        } +          /*Extract ctr xlator options*/          ret_db = extract_ctr_options (this, priv);          if (ret_db) { @@ -2123,6 +2259,7 @@ init (xlator_t *this)                          goto error;          } +          ret_db = 0;          goto out; @@ -2185,6 +2322,11 @@ fini (xlator_t *this)                                  "db connection");                  }                  GF_FREE (priv->ctr_db_path); +                if (pthread_mutex_destroy (&priv->compact_lock)) { +                        gf_msg (this->name, GF_LOG_WARNING, 0, +                                CTR_MSG_CLOSE_DB_CONN_FAILED, "Failed to " +                                "destroy the compaction mutex"); +                }          }          GF_FREE (priv);          mem_pool_destroy (this->local_pool); diff --git a/xlators/features/changetimerecorder/src/ctr-helper.h b/xlators/features/changetimerecorder/src/ctr-helper.h index d5615270184..4fd4f745f4d 100644 --- a/xlators/features/changetimerecorder/src/ctr-helper.h +++ b/xlators/features/changetimerecorder/src/ctr-helper.h @@ -22,6 +22,7 @@  #include "common-utils.h"  #include <time.h>  #include <sys/time.h> +#include <pthread.h>  #include "gfdb_data_store.h"  #include "ctr-xlator-ctx.h" @@ -52,6 +53,9 @@ typedef struct gf_ctr_private {          gfdb_conn_node_t                *_db_conn;          uint64_t                        ctr_lookupheal_link_timeout;          uint64_t                        ctr_lookupheal_inode_timeout; +        gf_boolean_t                    compact_active; +        gf_boolean_t                    compact_mode_switched; +        pthread_mutex_t                 compact_lock;  } gf_ctr_private_t; diff --git a/xlators/mgmt/glusterd/src/glusterd-messages.h b/xlators/mgmt/glusterd/src/glusterd-messages.h index e520c69add2..2ba1876b6ec 100644 --- a/xlators/mgmt/glusterd/src/glusterd-messages.h +++ b/xlators/mgmt/glusterd/src/glusterd-messages.h @@ -4755,4 +4755,3 @@  /*------------*/  #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"  #endif /* !_GLUSTERD_MESSAGES_H_ */ - diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index ce34ffd2b05..d87082e9e89 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -387,6 +387,14 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,                          goto out;                  }                  goto out; +        } else if (strstr (key, "tier-compact")) { +                if (strcmp (value, "on") && +                    strcmp (value, "off")) { +                        ret = -1; +                        goto out; +                } + +                goto out;          }          /* @@ -452,7 +460,9 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,                     strstr (key, "tier-max-mb") ||                     strstr (key, "tier-max-promote-file-size") ||                     strstr (key, "tier-max-files") || -                   strstr (key, "tier-demote-frequency")) { +                   strstr (key, "tier-demote-frequency") || +                   strstr (key, "tier-hot-compact-frequency") || +                   strstr (key, "tier-cold-compact-frequency")) {                  if (origin_val < 1) {                          snprintf (errstr, sizeof (errstr), "%s is not a "                                    " compatible value. %s expects a positive " @@ -464,7 +474,6 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,                          ret = -1;                          goto out;                  } -          }  out:          gf_msg_debug (this->name, 0, "Returning %d", ret); @@ -1589,17 +1598,17 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .flags      = OPT_FLAG_CLIENT_OPT          }, - 	/* Crypt xlator options */ +         /* Crypt xlator options */ -	{ .key         = "features.encryption", -	  .voltype     = "encryption/crypt", -	  .option      = "!feat", -	  .value       = "off", -	  .op_version  = 3, -	  .description = "enable/disable client-side encryption for " +        { .key         = "features.encryption", +          .voltype     = "encryption/crypt", +          .option      = "!feat", +          .value       = "off", +          .op_version  = 3, +          .description = "enable/disable client-side encryption for "                           "the volume.", -	  .flags       = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT -	}, +          .flags       = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT +        },          { .key         = "encryption.master-key",            .voltype     = "encryption/crypt", @@ -1968,7 +1977,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .flags      = OPT_FLAG_CLIENT_OPT          }, -	/* Feature translators */ +        /* Feature translators */          { .key         = "features.uss",            .voltype     = "features/snapview-server",            .op_version  = GD_OP_VERSION_3_6_0, @@ -2730,6 +2739,32 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .description = "The maximum number of files that may be migrated"            " in any direction in a given cycle by a single node."          }, +        { .key         = "cluster.tier-compact", +          .voltype     = "cluster/tier", +          .option      = "tier-compact", +          .value       = "on", +          .op_version  = GD_OP_VERSION_3_9_0, +          .flags       = OPT_FLAG_CLIENT_OPT, +          .validate_fn = validate_tier, +          .description = "Activate or deactivate the compaction of the DB" +          " for the volume's metadata." +        }, +        { .key         = "cluster.tier-hot-compact-frequency", +          .voltype     = "cluster/tier", +          .value       = "604800", +          .option      = "tier-hot-compact-frequency", +          .op_version  = GD_OP_VERSION_3_9_0, +          .flags       = OPT_FLAG_CLIENT_OPT, +          .validate_fn = validate_tier, +        }, +        { .key         = "cluster.tier-cold-compact-frequency", +          .voltype     = "cluster/tier", +          .value       = "604800", +          .option      = "tier-cold-compact-frequency", +          .op_version  = GD_OP_VERSION_3_9_0, +          .flags       = OPT_FLAG_CLIENT_OPT, +          .validate_fn = validate_tier, +        },          { .key         = "features.ctr-enabled",            .voltype     = "features/changetimerecorder",            .value       = "off",  | 
