diff options
author | Diogenes Nunez <dnunez@redhat.com> | 2016-07-27 11:09:47 -0400 |
---|---|---|
committer | Dan Lambright <dlambrig@redhat.com> | 2016-09-04 18:37:57 -0700 |
commit | 261c035c7d0cd1639cc8bd0ead82c30efcc0e93f (patch) | |
tree | af3a2e498023e7ad8af417312b83ce2f969ef738 | |
parent | 6459fc812219551291e4be426ed8ecf2c90813a4 (diff) |
cluster/tier: Adding compaction option for metadata databases
Problem: As metadata in the database fills up, querying the database
take a long time. As a result, tier migration slows down. To
counteract this, we added a way to enable the compaction methods of
the underlying database. The goal is to reduce the size of the
underlying file by eliminating database fragmentation.
NOTE: There is currently a bug where sometimes a brick will
attempt to activate compaction. This happens even compaction is already
turned on.
The cause is narrowed down to the compact_mode_switch flipping its value.
Changes: libglusterfs/src/gfdb - Added a gfdb function to compact the
underlying database, compact_db() This is a no-op if the database has
no such option.
- Added a compaction function for SQLite3 that does the following
1) Changes the auto_vacuum pragma of the database
2) Compacts the database according to the type of compaction requested
- Compaction type can be changed by changing the macro
GF_SQL_COMPACT_DEF to one of the 4 compaction types in
gfdb_sqlite3.h
It is currently set to GF_SQL_COMPACT_INCR, or incremental
vacuuming.
xlators/cluster/dht/src - Added the following command-line option to
enable SQLite3 compaction.
gluster volume set <vol-name> tier-compact <off|on>
- Added the following command-line option to change the frequency the
hot and cold tier are ordered to compact.
gluster volume set <vol-name> tier-hot-compact-frequency <int>
gluster volume set <vol-name> tier-cold-compact-frequency <int>
- tier daemon periodically sends the (new)
GFDB_IPC_CTR_SET_COMPACT_PRAGMA IPC to the CTR xlator. The IPC
triggers compaction of the database.
The inputs are both gf_boolean_t.
IPC Input:
compact_active: Is compaction currently on for the db.
compact_mode_switched: Did we flip the compaction switch recently?
IPC Output:
0 if the compaction succeeds.
Non-zero otherwise.
xlators/features/changetimerecorder/src/ - When the CTR gets the
compaction IPC, it launches a thread that will perform the
compaction. The IPC ends after the thread is launched. To avoid extra
allocations, the parameters are passed using static variables.
Change-Id: I5e1433becb9eeff2afe8dcb4a5798977bf5ba0dd
Signed-off-by: Diogenes Nunez <dnunez@redhat.com>
Reviewed-on: http://review.gluster.org/15031
Reviewed-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Tested-by: Dan Lambright <dlambrig@redhat.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store.c | 40 | ||||
-rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store.h | 19 | ||||
-rw-r--r-- | libglusterfs/src/gfdb/gfdb_data_store_types.h | 50 | ||||
-rw-r--r-- | libglusterfs/src/gfdb/gfdb_sqlite3.c | 187 | ||||
-rw-r--r-- | libglusterfs/src/gfdb/gfdb_sqlite3.h | 21 | ||||
-rw-r--r-- | libglusterfs/src/libglusterfs-messages.h | 23 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 31 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 26 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.c | 554 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.h | 6 | ||||
-rw-r--r-- | xlators/features/changetimerecorder/src/changetimerecorder.c | 146 | ||||
-rw-r--r-- | xlators/features/changetimerecorder/src/ctr-helper.h | 4 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-messages.h | 1 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 59 |
14 files changed, 1057 insertions, 110 deletions
diff --git a/libglusterfs/src/gfdb/gfdb_data_store.c b/libglusterfs/src/gfdb/gfdb_data_store.c index 9c042f9e82e..cb567503fa3 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store.c +++ b/libglusterfs/src/gfdb/gfdb_data_store.c @@ -433,6 +433,43 @@ delete_record (gfdb_conn_node_t *_conn_node, return ret; } +/*Libgfdb API Function: Compact the database. + * + * Arguments: + * _conn_node : GFDB Connection node + * _compact_active : Is compaction currently on? + * _compact_mode_switched : Was the compaction switch flipped? + * Returns : if successful return 0 or + * -ve value in case of failure*/ +int +compact_db (gfdb_conn_node_t *_conn_node, gf_boolean_t _compact_active, + gf_boolean_t _compact_mode_switched) +{ + int ret = 0; + gfdb_db_operations_t *db_operations_t = NULL; + void *gf_db_connection = NULL; + + CHECK_CONN_NODE(_conn_node); + + db_operations_t = &_conn_node->gfdb_connection.gfdb_db_operations; + gf_db_connection = _conn_node->gfdb_connection.gf_db_connection; + + if (db_operations_t->compact_db_op) { + + ret = db_operations_t->compact_db_op (gf_db_connection, + _compact_active, + _compact_mode_switched); + if (ret) { + gf_msg (GFDB_DATA_STORE, GF_LOG_ERROR, 0, + LG_MSG_COMPACT_FAILED, "Compaction operation " + "failed"); + } + + } + + return ret; +} + @@ -835,5 +872,8 @@ void get_gfdb_methods (gfdb_methods_t *methods) /* Link info related functions */ methods->gfdb_link_info_new = gfdb_link_info_new; methods->gfdb_link_info_free = gfdb_link_info_free; + + /* Compaction related functions */ + methods->compact_db = compact_db; } diff --git a/libglusterfs/src/gfdb/gfdb_data_store.h b/libglusterfs/src/gfdb/gfdb_data_store.h index eacb8527034..0aac4611153 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store.h +++ b/libglusterfs/src/gfdb/gfdb_data_store.h @@ -31,7 +31,7 @@ #define GFDB_IPC_CTR_CLEAR_OPS "gfdb.ipc-ctr-clear-op" #define GFDB_IPC_CTR_GET_DB_PARAM_OPS "gfdb.ipc-ctr-get-db-parm" #define GFDB_IPC_CTR_GET_DB_VERSION_OPS "gfdb.ipc-ctr-get-db-version" - +#define GFDB_IPC_CTR_SET_COMPACT_PRAGMA "gfdb.ipc-ctr-set-compact-pragma" /* * CTR IPC INPUT/OUTPUT * @@ -348,6 +348,21 @@ typedef int (*set_db_params_t)(gfdb_conn_node_t *db_conn, char *param_key, char *param_value); +/*Libgfdb API Function: Compact the database. + * + * Arguments: + * _conn_node : GFDB Connection node + * _compact_active : Is compaction currently on? + * _compact_mode_switched : Was the compaction switch flipped? + * Returns : if successful return 0 or + * -ve value in case of failure*/ +int +compact_db (gfdb_conn_node_t *_conn_node, gf_boolean_t _compact_active, + gf_boolean_t _compact_mode_switched); + +typedef int (*compact_db_t)(gfdb_conn_node_t *db_conn, + gf_boolean_t compact_active, + gf_boolean_t compact_mode_switched); typedef struct gfdb_methods_s { @@ -377,6 +392,8 @@ typedef struct gfdb_methods_s { gfdb_link_info_new_t gfdb_link_info_new; gfdb_link_info_free_t gfdb_link_info_free; + /* Compaction related functions */ + compact_db_t compact_db; } gfdb_methods_t; void get_gfdb_methods (gfdb_methods_t *methods); diff --git a/libglusterfs/src/gfdb/gfdb_data_store_types.h b/libglusterfs/src/gfdb/gfdb_data_store_types.h index 1acbdf2f99f..d0c96370eb8 100644 --- a/libglusterfs/src/gfdb/gfdb_data_store_types.h +++ b/libglusterfs/src/gfdb/gfdb_data_store_types.h @@ -40,7 +40,8 @@ typedef enum gf_db_operation { GFDB_W_DELETE_DB_OP, GFDB_UW_DELETE_DB_OP, GFDB_WFC_UPDATE_DB_OP, - GFDB_RFC_UPDATE_DB_OP + GFDB_RFC_UPDATE_DB_OP, + GFDB_DB_COMPACT_DB_OP /* Added for VACUUM/manual compaction support */ } gf_db_operation_t; @@ -81,19 +82,12 @@ gfdb_time_2_usec(gfdb_time_t *gfdb_time) return ((uint64_t) gfdb_time->tv_sec * GFDB_MICROSEC) + gfdb_time->tv_usec; } - - - - /****************************************************************************** * * Insert/Update Record related data structures/functions * * ****************************************************************************/ - - - /*Indicated a generic synchronous write to the db * This may or may not be implemented*/ typedef enum gfdb_sync_type { @@ -123,11 +117,6 @@ out: return ret; } - - - - - /*Indicated different types of db*/ typedef enum gfdb_db_type { GFDB_INVALID_DB = -1, @@ -165,12 +154,6 @@ out: return ret; } - - - - - - /*Tells the path of the fop*/ typedef enum gfdb_fop_path { GFDB_FOP_INVALID = -1, @@ -206,12 +189,6 @@ isunwindpath(gfdb_fop_path_t gfdb_fop_path) return (gfdb_fop_path >= GFDB_FOP_UNWIND) ? _gf_true : _gf_false; } - - - - - - /*Tell what type of fop it was * Like whether a dentry fop or a inode fop * Read fop or a write fop etc*/ @@ -258,12 +235,6 @@ isdentrycreatefop(gfdb_fop_type_t fop_type) _gf_true : _gf_false; } - - - - - - /*The structure that is used to send insert/update the databases * using insert_db api*/ typedef struct gfdb_db_record { @@ -374,6 +345,20 @@ typedef int +/*Used to compact the database + * Arguments: + * db_conn : GFDB Connection node + * compact_active : Is compaction currently on? + * compact_mode_switched : Was the compaction switch flipped? + * Returns : if successful return 0 or + * -ve value in case of failure*/ +typedef int +(*gfdb_compact_db_t)(void *db_conn, gf_boolean_t compact_active, + gf_boolean_t compact_mode_switched); + + + + /* Query all the records from the database * Arguments: * db_conn : plugin specific data base connection @@ -502,6 +487,7 @@ typedef struct gfdb_db_operations { gfdb_fini_db_t fini_db_op; gfdb_insert_record_t insert_record_op; gfdb_delete_record_t delete_record_op; + gfdb_compact_db_t compact_db_op; gfdb_find_all_t find_all_op; gfdb_find_unchanged_for_time_t find_unchanged_for_time_op; gfdb_find_recently_changed_files_t find_recently_changed_files_op; @@ -598,5 +584,3 @@ typedef struct gfdb_connection { #endif - - diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.c b/libglusterfs/src/gfdb/gfdb_sqlite3.c index 04781be562a..094028361c5 100644 --- a/libglusterfs/src/gfdb/gfdb_sqlite3.c +++ b/libglusterfs/src/gfdb/gfdb_sqlite3.c @@ -239,6 +239,7 @@ gf_sqlite3_fill_db_operations(gfdb_db_operations_t *gfdb_db_ops) gfdb_db_ops->insert_record_op = gf_sqlite3_insert; gfdb_db_ops->delete_record_op = gf_sqlite3_delete; + gfdb_db_ops->compact_db_op = gf_sqlite3_vacuum; gfdb_db_ops->find_all_op = gf_sqlite3_find_all; gfdb_db_ops->find_unchanged_for_time_op = @@ -1327,10 +1328,14 @@ gf_sqlite3_pragma (void *db_conn, char *pragma_key, char **pragma_value) goto out; } - ret = gf_asprintf (pragma_value, "%s", sqlite3_column_text (pre_stmt, 0)); - if (ret <= 0) { - gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, LG_MSG_QUERY_FAILED, - "Failed to get %s from db", pragma_key); + if (pragma_value) { + ret = gf_asprintf (pragma_value, "%s", + sqlite3_column_text (pre_stmt, 0)); + if (ret <= 0) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_QUERY_FAILED, "Failed to get %s from db", + pragma_key); + } } ret = 0; @@ -1382,3 +1387,177 @@ out: return ret; } + +/* Function to vacuum of sqlite db + * Input: + * void *db_conn : Sqlite connection + * gf_boolean_t compact_active : Is compaction on? + * gf_boolean_t compact_mode_switched : Did we just flip the compaction swtich? + * Return: + * On success return 0 + * On failure return -1 + * */ +int +gf_sqlite3_vacuum (void *db_conn, gf_boolean_t compact_active, + gf_boolean_t compact_mode_switched) +{ + int ret = -1; + gf_sql_connection_t *sql_conn = db_conn; + char *sqlstring = NULL; + char *sql_strerror = NULL; + gf_boolean_t changing_pragma = _gf_true; + + CHECK_SQL_CONN (sql_conn, out); + + if (GF_SQL_COMPACT_DEF == GF_SQL_COMPACT_NONE) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_INFO, 0, + LG_MSG_COMPACT_STATUS, + "VACUUM type is off: no VACUUM to do"); + goto out; + } + + if (compact_mode_switched) { + if (compact_active) { /* Then it was OFF before. + So turn everything on */ + ret = 0; + switch (GF_SQL_COMPACT_DEF) { + case GF_SQL_COMPACT_FULL: + ret = gf_sqlite3_set_pragma (db_conn, + "auto_vacuum", + GF_SQL_AV_FULL); + break; + case GF_SQL_COMPACT_INCR: + ret = gf_sqlite3_set_pragma (db_conn, + "auto_vacuum", + GF_SQL_AV_INCR); + break; + case GF_SQL_COMPACT_MANUAL: + changing_pragma = _gf_false; + default: + ret = -1; + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_COMPACT_FAILED, + "VACUUM type undefined"); + goto out; + break; + } + + } else { /* Then it was ON before, so turn it all off */ + if (GF_SQL_COMPACT_DEF == GF_SQL_COMPACT_FULL || + GF_SQL_COMPACT_DEF == GF_SQL_COMPACT_INCR) { + ret = gf_sqlite3_set_pragma (db_conn, + "auto_vacuum", + GF_SQL_AV_NONE); + } else { + changing_pragma = _gf_false; + } + } + + if (ret) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_TRACE, 0, + LG_MSG_PREPARE_FAILED, + "Failed to set the pragma"); + goto out; + } + + gf_msg (GFDB_STR_SQLITE3, GF_LOG_INFO, 0, + LG_MSG_COMPACT_STATUS, "Turning compaction %i", + GF_SQL_COMPACT_DEF); + + /* If we move from an auto_vacuum scheme to off, */ + /* or vice-versa, we must VACUUM to save the change. */ + /* In the case of a manual VACUUM scheme, we might as well */ + /* run a manual VACUUM now if we */ + if (changing_pragma || compact_active) { + ret = gf_asprintf (&sqlstring, "VACUUM;"); + if (ret <= 0) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_PREPARE_FAILED, + "Failed allocating memory"); + goto out; + } + gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, + LG_MSG_COMPACT_STATUS, "Sealed with a VACUUM"); + } + } else { /* We are active, so it's time to VACUUM */ + if (!compact_active) { /* Did we somehow enter an inconsistent + state? */ + ret = -1; + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_PREPARE_FAILED, + "Tried to VACUUM when compaction inactive"); + goto out; + } + + gf_msg(GFDB_STR_SQLITE3, GF_LOG_TRACE, 0, + LG_MSG_COMPACT_STATUS, + "Doing regular vacuum of type %i", GF_SQL_COMPACT_DEF); + + switch (GF_SQL_COMPACT_DEF) { + case GF_SQL_COMPACT_INCR: /* INCR auto_vacuum */ + ret = gf_asprintf(&sqlstring, + "PRAGMA incremental_vacuum;"); + if (ret <= 0) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_PREPARE_FAILED, + "Failed allocating memory"); + goto out; + } + gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, + LG_MSG_COMPACT_STATUS, + "Will commence an incremental VACUUM"); + break; + /* (MANUAL) Invoke the VACUUM command */ + case GF_SQL_COMPACT_MANUAL: + ret = gf_asprintf(&sqlstring, "VACUUM;"); + if (ret <= 0) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_PREPARE_FAILED, + "Failed allocating memory"); + goto out; + } + gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, + LG_MSG_COMPACT_STATUS, + "Will commence a VACUUM"); + break; + /* (FULL) The database does the compaction itself. */ + /* We cannot do anything else, so we can leave */ + /* without sending anything to the database */ + case GF_SQL_COMPACT_FULL: + ret = 0; + goto success; + /* Any other state must be an error. Note that OFF */ + /* cannot hit this statement since we immediately leave */ + /* in that case */ + default: + ret = -1; + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_COMPACT_FAILED, + "VACUUM type undefined"); + goto out; + break; + } + } + + gf_msg(GFDB_STR_SQLITE3, GF_LOG_TRACE, 0, LG_MSG_COMPACT_STATUS, + "SQLString == %s", sqlstring); + + ret = sqlite3_exec(sql_conn->sqlite3_db_conn, sqlstring, NULL, NULL, + &sql_strerror); + + if (ret != SQLITE_OK) { + gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, + LG_MSG_GET_RECORD_FAILED, "Failed to vacuum " + "the db : %s", sqlite3_errmsg (db_conn)); + ret = -1; + goto out; + } +success: + gf_msg(GFDB_STR_SQLITE3, GF_LOG_INFO, 0, LG_MSG_COMPACT_STATUS, + compact_mode_switched ? "Successfully changed VACUUM on/off" + : "DB successfully VACUUM"); +out: + GF_FREE(sqlstring); + + return ret; +} diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.h b/libglusterfs/src/gfdb/gfdb_sqlite3.h index 9d0d996a322..4d70a60e431 100644 --- a/libglusterfs/src/gfdb/gfdb_sqlite3.h +++ b/libglusterfs/src/gfdb/gfdb_sqlite3.h @@ -73,8 +73,7 @@ do {\ #define GF_SQL_AV_NONE "none" #define GF_SQL_AV_FULL "full" -#define GF_SQL_AV_INCR "incr" - +#define GF_SQL_AV_INCR "incremental" #define GF_SQL_SYNC_OFF "off" #define GF_SQL_SYNC_NORMAL "normal" @@ -87,7 +86,12 @@ do {\ #define GF_SQL_JM_WAL "wal" #define GF_SQL_JM_OFF "off" +#define GF_SQL_COMPACT_NONE 0 +#define GF_SQL_COMPACT_FULL 1 +#define GF_SQL_COMPACT_INCR 2 +#define GF_SQL_COMPACT_MANUAL 3 +#define GF_SQL_COMPACT_DEF GF_SQL_COMPACT_INCR typedef enum gf_sql_auto_vacuum { gf_sql_av_none = 0, gf_sql_av_full, @@ -319,7 +323,18 @@ int gf_sqlite3_pragma (void *db_conn, char *pragma_key, char **pragma_value); int gf_sqlite3_set_pragma (void *db_conn, char *pragma_key, char *pragma_value); - +/* Function to vacuum of sqlite db + * Input: + * void *db_conn : Sqlite connection + * gf_boolean_t compact_active : Is compaction on? + * gf_boolean_t compact_mode_switched : Did we just flip the compaction swtich? + * Return: + * On success return 0 + * On failure return -1 + * */ +int +gf_sqlite3_vacuum (void *db_conn, gf_boolean_t compact_active, + gf_boolean_t compact_mode_switched); void gf_sqlite3_fill_db_operations (gfdb_db_operations_t *gfdb_db_ops); diff --git a/libglusterfs/src/libglusterfs-messages.h b/libglusterfs/src/libglusterfs-messages.h index d2ad44e470e..29196929eb3 100644 --- a/libglusterfs/src/libglusterfs-messages.h +++ b/libglusterfs/src/libglusterfs-messages.h @@ -36,7 +36,9 @@ */ #define GLFS_LG_BASE GLFS_MSGID_COMP_LIBGLUSTERFS -#define GLFS_LG_NUM_MESSAGES 207 + +#define GLFS_LG_NUM_MESSAGES 209 + #define GLFS_LG_MSGID_END (GLFS_LG_BASE + GLFS_LG_NUM_MESSAGES + 1) /* Messaged with message IDs */ #define glfs_msg_start_lg GLFS_LG_BASE, "Invalid: Start of messages" @@ -1762,6 +1764,7 @@ * @recommendedaction * */ + #define LG_MSG_INVALID_INODE_LIST (GLFS_LG_BASE + 207) /*! @@ -1770,6 +1773,24 @@ * @recommendedaction * */ + +#define LG_MSG_COMPACT_FAILED (GLFS_LG_BASE + 208) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + * + */ + +#define LG_MSG_COMPACT_STATUS (GLFS_LG_BASE + 209) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + * + */ /*------------*/ #define glfs_msg_end_lg GLFS_LG_MSGID_END, "Invalid: End of messages" diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 3717a68273c..da1bcb6a4a1 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -310,7 +310,7 @@ typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; - double avail_inodes; + double avail_inodes; uint64_t avail_space; uint32_t log; uint32_t chunks; @@ -325,10 +325,10 @@ enum gf_defrag_type { GF_DEFRAG_CMD_START_FORCE = 1 + 4, GF_DEFRAG_CMD_START_TIER = 1 + 5, GF_DEFRAG_CMD_STATUS_TIER = 1 + 6, - GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, - GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, - GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, - GF_DEFRAG_CMD_RESUME_TIER = 1 + 10, + GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, + GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, + GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, + GF_DEFRAG_CMD_RESUME_TIER = 1 + 10, }; typedef enum gf_defrag_type gf_defrag_type; @@ -398,9 +398,26 @@ typedef struct gf_tier_conf { uint64_t max_migrate_bytes; int max_migrate_files; tier_mode_t mode; + /* These flags are only used for tier-compact */ + gf_boolean_t compact_active; + /* These 3 flags are set to true when the client changes the */ + /* compaction mode on the command line. */ + /* When they are set, the daemon will trigger compaction as */ + /* soon as possible to activate or deactivate compaction. */ + /* If in the middle of a compaction, then the switches take */ + /* effect on the next compaction, not the current one. */ + /* If the user switches it off, we want to avoid needless */ + /* compactions. */ + /* If the user switches it on, they want to compact as soon */ + /* as possible. */ + gf_boolean_t compact_mode_switched; + gf_boolean_t compact_mode_switched_hot; + gf_boolean_t compact_mode_switched_cold; int tier_max_promote_size; int tier_promote_frequency; int tier_demote_frequency; + int tier_compact_hot_frequency; + int tier_compact_cold_frequency; uint64_t st_last_promoted_size; uint64_t st_last_demoted_size; tier_pause_state_t pause_state; @@ -1023,9 +1040,9 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata); int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t mode, off_t offset, size_t len, dict_t *xdata); + int32_t mode, off_t offset, size_t len, dict_t *xdata); int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, size_t len, dict_t *xdata); + off_t offset, size_t len, dict_t *xdata); int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, off_t len, dict_t *xdata); int32_t dht_ipc (call_frame_t *frame, xlator_t *this, int32_t op, diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 873ced53eec..f410f71b5a6 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -148,7 +148,7 @@ dht_priv_dump (xlator_t *this) gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); gf_proc_dump_write("gen", "%d", conf->gen); gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); - gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); @@ -433,14 +433,14 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("lookup-optimize", conf->lookup_optimize, options, bool, out); - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, percent_or_size, out); /* option can be any one of percent or bytes */ conf->disk_unit = 0; if (conf->min_free_disk < 100.0) conf->disk_unit = 'p'; - GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, percent, out); GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, @@ -711,8 +711,8 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, err); @@ -901,7 +901,7 @@ struct volume_options options[] = { "process starts balancing out the cluster, and logs will appear " "in log files", }, - { .key = {"min-free-inodes"}, + { .key = {"min-free-inodes"}, .type = GF_OPTION_TYPE_PERCENT, .default_value = "5%", .description = "after system has only N% of inodes, warnings " @@ -1038,6 +1038,20 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_STR, .default_value = "test", }, + { .key = {"tier-compact"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"tier-hot-compact-frequency"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "604800", + .description = "Frequency to compact DBs on hot tier in system" + }, + { .key = {"tier-cold-compact-frequency"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "604800", + .description = "Frequency to compact DBs on cold tier in system" + }, { .key = {"tier-max-mb"}, .type = GF_OPTION_TYPE_INT, .default_value = "4000", diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index 0d53a62d327..7e5e1004b84 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -1240,15 +1240,15 @@ tier_process_ctr_query (tier_brick_list_t *local_brick, void *args) gfdb_brick_info->time_stamp, sizeof (gfdb_time_t)); - SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, - GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_QUERY_OPS, - ret, out); + SET_DB_PARAM_TO_DICT (this->name, ctr_ipc_in_dict, + GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_QUERY_OPS, + ret, out); - SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, - GFDB_IPC_CTR_GET_QFILE_PATH, - local_brick->qfile_path, - ret, out); + SET_DB_PARAM_TO_DICT (this->name, ctr_ipc_in_dict, + GFDB_IPC_CTR_GET_QFILE_PATH, + local_brick->qfile_path, + ret, out); ret = dict_set_bin (ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS, ipc_ctr_params, sizeof (*ipc_ctr_params)); @@ -1360,7 +1360,7 @@ tier_process_brick (tier_brick_list_t *local_brick, void *args) { gf_msg ("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, "Failed to set %s " "to params dictionary", - GFDB_IPC_CTR_GET_DB_KEY);\ + GFDB_IPC_CTR_GET_DB_KEY); goto out; } @@ -1442,11 +1442,12 @@ tier_build_migration_qfile (migration_args_t *args, } time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; - /* The migration daemon may run a varrying numberof usec after the sleep */ - /* call triggers. A file may be registered in CTR some number of usec X */ - /* after the daemon started and missed in the subsequent cycle if the */ - /* daemon starts Y usec after the period in seconds where Y>X. Normalize */ - /* away this problem by always setting usec to 0. */ + /* The migration daemon may run a varying numberof usec after the */ + /* sleep call triggers. A file may be registered in CTR some number */ + /* of usec X after the daemon started and missed in the subsequent */ + /* cycle if the daemon starts Y usec after the period in seconds */ + /* where Y>X. Normalize away this problem by always setting usec */ + /* to 0. */ time_in_past.tv_usec = 0; gfdb_brick_info.time_stamp = &time_in_past; @@ -1649,6 +1650,265 @@ out: return ret; } + +/* + * Command the CTR on a brick to compact the local database using an IPC + */ +static int +tier_process_self_compact (tier_brick_list_t *local_brick, void *args) +{ + int ret = -1; + char *db_path = NULL; + query_cbk_args_t *query_cbk_args = NULL; + xlator_t *this = NULL; + gfdb_conn_node_t *conn_node = NULL; + dict_t *params_dict = NULL; + dict_t *ctr_ipc_dict = NULL; + gfdb_brick_info_t *gfdb_brick_info = args; + int is_changing = -1; + + /*Init of all the essentials*/ + GF_VALIDATE_OR_GOTO ("tier", gfdb_brick_info , out); + query_cbk_args = gfdb_brick_info->_query_cbk_args; + + GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out); + this = query_cbk_args->this; + + GF_VALIDATE_OR_GOTO (this->name, + gfdb_brick_info->_query_cbk_args, out); + + GF_VALIDATE_OR_GOTO (this->name, local_brick, out); + + GF_VALIDATE_OR_GOTO (this->name, local_brick->xlator, out); + + GF_VALIDATE_OR_GOTO (this->name, local_brick->brick_db_path, out); + + db_path = local_brick->brick_db_path; + + /*Preparing DB parameters before init_db i.e getting db connection*/ + params_dict = dict_new (); + if (!params_dict) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "DB Params cannot initialized"); + goto out; + } + SET_DB_PARAM_TO_DICT (this->name, params_dict, + (char *) gfdb_methods.get_db_path_key(), db_path, + ret, out); + + /*Get the db connection*/ + conn_node = gfdb_methods.init_db ((void *)params_dict, + dht_tier_db_type); + if (!conn_node) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "FATAL: Failed initializing db operations"); + goto out; + } + + ret = 0; + + /*Preparing ctr_ipc_dict*/ + ctr_ipc_dict = dict_new (); + if (!ctr_ipc_dict) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "ctr_ipc_dict cannot initialized"); + goto out; + } + + ret = dict_set_int32 (ctr_ipc_dict, "compact_active", + query_cbk_args->defrag-> + tier_conf.compact_active); + + if (ret) { + gf_msg ("tier", GF_LOG_ERROR, 0, + LG_MSG_SET_PARAM_FAILED, "Failed to set %s " + "to params dictionary", + "compact_active"); + goto out; + } + + ret = dict_set_int32 (ctr_ipc_dict, "compact_mode_switched", + query_cbk_args->defrag-> + tier_conf.compact_mode_switched); + + if (ret) { + gf_msg ("tier", GF_LOG_ERROR, 0, + LG_MSG_SET_PARAM_FAILED, "Failed to set %s " + "to params dictionary", + "compact_mode_switched"); + goto out; + } + + SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, + GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_SET_COMPACT_PRAGMA, + ret, out); + + gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, + "Starting Compaction IPC"); + + ret = syncop_ipc (local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict, + NULL); + + gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, + "Ending Compaction IPC"); + + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, "Failed compaction " + "on db %s error %d", local_brick->brick_db_path, ret); + goto out; + } + + gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, + "SUCCESS: %s Compaction", local_brick->brick_name); + + ret = 0; +out: + if (params_dict) { + dict_unref (params_dict); + params_dict = NULL; + } + + if (ctr_ipc_dict) { + dict_unref (ctr_ipc_dict); + ctr_ipc_dict = NULL; + } + + gfdb_methods.fini_db (conn_node); + + return ret; +} + +/* + * This is the call back function for each brick from hot/cold bricklist. + * It determines the database type on each brick and calls the corresponding + * function to prepare the compaction IPC. + */ +static int +tier_compact_db_brick (tier_brick_list_t *local_brick, void *args) +{ + int ret = -1; + char *strval = NULL; + + GF_VALIDATE_OR_GOTO ("tier", local_brick, out); + + GF_VALIDATE_OR_GOTO ("tier", local_brick->xlator, out); + + ret = tier_process_self_compact (local_brick, args); + if (ret) { + gf_msg ("tier", GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "Brick %s did not compact", + local_brick->brick_name); + goto out; + } + + ret = 0; + +out: + + return ret; +} + +static int +tier_send_compact (migration_args_t *args, + query_cbk_args_t *query_cbk_args) +{ + gfdb_time_t current_time; + gfdb_brick_info_t gfdb_brick_info; + gfdb_time_t time_in_past; + int ret = -1; + tier_brick_list_t *local_brick = NULL; + + time_in_past.tv_sec = args->freq_time; + time_in_past.tv_usec = 0; + + ret = gettimeofday (¤t_time, NULL); + if (ret == -1) { + gf_msg (args->this->name, GF_LOG_ERROR, errno, + DHT_MSG_SYS_CALL_GET_TIME_FAILED, + "Failed to get current time"); + goto out; + } + time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; + + /* The migration daemon may run a varying numberof usec after the sleep + call triggers. A file may be registered in CTR some number of usec X + after the daemon started and missed in the subsequent cycle if the + daemon starts Y usec after the period in seconds where Y>X. Normalize + away this problem by always setting usec to 0. */ + time_in_past.tv_usec = 0; + + gfdb_brick_info.time_stamp = &time_in_past; + + /* This is meant to say we are always compacting at this point */ + /* We simply borrow the promotion flag to do this */ + gfdb_brick_info._gfdb_promote = 1; + + gfdb_brick_info._query_cbk_args = query_cbk_args; + + list_for_each_entry (local_brick, args->brick_list, list) { + + gf_msg (args->this->name, GF_LOG_TRACE, 0, + DHT_MSG_LOG_TIER_STATUS, + "Start compaction for %s", + local_brick->brick_name); + + ret = tier_compact_db_brick (local_brick, + &gfdb_brick_info); + if (ret) { + gf_msg (args->this->name, GF_LOG_ERROR, 0, + DHT_MSG_BRICK_QUERY_FAILED, + "Brick %s compaction failed\n", + local_brick->brick_db_path); + } + + gf_msg (args->this->name, GF_LOG_TRACE, 0, + DHT_MSG_LOG_TIER_STATUS, + "End compaction for %s", + local_brick->brick_name); + + } + ret = 0; +out: + return ret; +} + +static int +tier_compact (void *args) +{ + int ret = -1; + query_cbk_args_t query_cbk_args; + migration_args_t *compaction_args = args; + + GF_VALIDATE_OR_GOTO ("tier", compaction_args->this, out); + GF_VALIDATE_OR_GOTO (compaction_args->this->name, + compaction_args->brick_list, out); + GF_VALIDATE_OR_GOTO (compaction_args->this->name, + compaction_args->defrag, out); + + THIS = compaction_args->this; + + query_cbk_args.this = compaction_args->this; + query_cbk_args.defrag = compaction_args->defrag; + query_cbk_args.is_compaction = 1; + + /* Send the compaction pragma out to all the bricks on the bricklist. */ + /* tier_get_bricklist ensures all bricks on the list are local to */ + /* this node. */ + ret = tier_send_compact (compaction_args, &query_cbk_args); + if (ret) + goto out; + + ret = 0; +out: + compaction_args->return_value = ret; + return ret; + } + static int tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head) { @@ -1755,6 +2015,18 @@ tier_get_freq_promote (gf_tier_conf_t *tier_conf) return tier_conf->tier_promote_frequency; } +int +tier_get_freq_compact_hot (gf_tier_conf_t *tier_conf) +{ + return tier_conf->tier_compact_hot_frequency; +} + +int +tier_get_freq_compact_cold (gf_tier_conf_t *tier_conf) +{ + return tier_conf->tier_compact_cold_frequency; +} + static int tier_check_demote (gfdb_time_t current_time, int freq) { @@ -1776,8 +2048,21 @@ tier_check_promote (gf_tier_conf_t *tier_conf, _gf_true : _gf_false; } +static gf_boolean_t +tier_check_compact (gf_tier_conf_t *tier_conf, + gfdb_time_t current_time, + int freq_compact) +{ + + if (!(tier_conf->compact_active || + tier_conf->compact_mode_switched)) + return _gf_false; + return ((current_time.tv_sec % freq_compact) == 0) ? + _gf_true : _gf_false; +} + void clear_bricklist (struct list_head *brick_list) @@ -1824,6 +2109,72 @@ out: return; } +static int +tier_prepare_compact (migration_args_t *args, gfdb_time_t current_time) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + gf_tier_conf_t *tier_conf = NULL; + gf_boolean_t is_hot_tier = _gf_false; + int freq = 0; + int ret = -1; + const char *tier_type = is_hot_tier ? "hot" : "cold"; + + this = args->this; + + conf = this->private; + + defrag = conf->defrag; + + tier_conf = &defrag->tier_conf; + + is_hot_tier = args->is_hot_tier; + + freq = is_hot_tier ? tier_get_freq_compact_hot (tier_conf) : + tier_get_freq_compact_cold (tier_conf); + + defrag->tier_conf.compact_mode_switched = is_hot_tier ? + defrag->tier_conf.compact_mode_switched_hot : + defrag->tier_conf.compact_mode_switched_cold; + + gf_msg(this->name, GF_LOG_TRACE, 0, + DHT_MSG_LOG_TIER_STATUS, + "Compact mode %i", + defrag->tier_conf.compact_mode_switched); + + if (tier_check_compact (tier_conf, current_time, + freq)) { + gf_msg (this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "Start compaction on %s tier", + tier_type); + + args->freq_time = freq; + ret = tier_compact (args); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, "Compaction failed on " + "%s tier", tier_type); + goto out; + } + + gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, + "End compaction on %s tier", tier_type); + + if (is_hot_tier) { + defrag->tier_conf.compact_mode_switched_hot = + _gf_false; + } else { + defrag->tier_conf.compact_mode_switched_cold = + _gf_false; + } + } + +out: + return ret; +} + /* * Main tiering loop. This is called from the promotion and the * demotion threads spawned in tier_start(). @@ -1846,8 +2197,9 @@ static void int check_watermark = 0; gf_defrag_info_t *defrag = NULL; xlator_t *this = NULL; + struct list_head *bricklist_temp = NULL; migration_args_t *args = in_args; - + gf_boolean_t compacted = _gf_false; GF_VALIDATE_OR_GOTO ("tier", args, out); GF_VALIDATE_OR_GOTO ("tier", args->brick_list, out); @@ -1884,7 +2236,8 @@ static void if (xlator != this) { gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, - "Detected graph switch. Exiting migration daemon."); + "Detected graph switch. Exiting migration " + "daemon."); goto out; } @@ -1912,10 +2265,10 @@ static void goto out; } - if (gf_defrag_get_pause_state (&defrag->tier_conf) != TIER_RUNNING) + if (gf_defrag_get_pause_state (&defrag->tier_conf) != + TIER_RUNNING) continue; - /* To have proper synchronization amongst all * brick holding nodes, so that promotion and demotions * start atomicly w.r.t promotion/demotion frequency @@ -1950,7 +2303,6 @@ static void } if (args->is_promotion) { - freq = tier_get_freq_promote (tier_conf); if (tier_check_promote (tier_conf, current_time, freq)) { @@ -1962,21 +2314,22 @@ static void "Promotion failed"); } } - + } else if (args->is_compaction) { + tier_prepare_compact (args, current_time); } else { - freq = tier_get_freq_demote (tier_conf); if (tier_check_demote (current_time, freq)) { args->freq_time = freq; ret = tier_demote (args); if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, + gf_msg (this->name, + GF_LOG_ERROR, + 0, DHT_MSG_LOG_TIER_ERROR, "Demotion failed"); } } - } /* Check the statfs immediately after the processing threads @@ -1997,11 +2350,15 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) { pthread_t promote_thread; pthread_t demote_thread; + pthread_t hot_compact_thread; + pthread_t cold_compact_thread; int ret = -1; struct list_head bricklist_hot = { 0 }; struct list_head bricklist_cold = { 0 }; migration_args_t promotion_args = { 0 }; migration_args_t demotion_args = { 0 }; + migration_args_t hot_compaction_args = { 0 }; + migration_args_t cold_compaction_args = { 0 }; dht_conf_t *conf = NULL; INIT_LIST_HEAD ((&bricklist_hot)); @@ -2016,6 +2373,7 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) demotion_args.brick_list = &bricklist_hot; demotion_args.defrag = defrag; demotion_args.is_promotion = _gf_false; + demotion_args.is_compaction = _gf_false; ret = pthread_create (&demote_thread, NULL, &tier_run, @@ -2047,6 +2405,47 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) goto waitforspawned; } + hot_compaction_args.this = this; + hot_compaction_args.brick_list = &bricklist_hot; + hot_compaction_args.defrag = defrag; + hot_compaction_args.is_promotion = _gf_false; + hot_compaction_args.is_compaction = _gf_true; + hot_compaction_args.is_hot_tier = _gf_true; + + ret = pthread_create (&hot_compact_thread, + NULL, &tier_run, + &hot_compaction_args); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "Failed to start compaction thread."); + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + goto waitforspawnedpromote; + } + + cold_compaction_args.this = this; + cold_compaction_args.brick_list = &bricklist_cold; + cold_compaction_args.defrag = defrag; + cold_compaction_args.is_promotion = _gf_false; + cold_compaction_args.is_compaction = _gf_true; + cold_compaction_args.is_hot_tier = _gf_false; + + ret = pthread_create (&cold_compact_thread, + NULL, &tier_run, + &cold_compaction_args); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "Failed to start compaction thread."); + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + goto waitforspawnedhotcompact; + } + pthread_join (cold_compact_thread, NULL); + +waitforspawnedhotcompact: + pthread_join (hot_compact_thread, NULL); + +waitforspawnedpromote: pthread_join (promote_thread, NULL); waitforspawned: @@ -2055,7 +2454,6 @@ waitforspawned: cleanup: clear_bricklist (&bricklist_cold); clear_bricklist (&bricklist_hot); - return ret; } @@ -2167,8 +2565,8 @@ out: return ret; } -static -int tier_validate_mode (char *mode) +static int +tier_validate_mode (char *mode) { int ret = -1; @@ -2181,6 +2579,26 @@ int tier_validate_mode (char *mode) return ret; } +static gf_boolean_t +tier_validate_compact_mode (char *mode) +{ + gf_boolean_t ret = _gf_false; + + gf_msg ("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, + "tier_validate_compact_mode: mode = %s", mode); + + if (!strcmp (mode, "on")) { + ret = _gf_true; + } else { + ret = _gf_false; + } + + gf_msg ("tier", GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_STATUS, + "tier_validate_compact_mode: ret = %i", ret); + + return ret; +} int tier_init_methods (xlator_t *this) @@ -2205,8 +2623,6 @@ err: return ret; } - - int tier_init (xlator_t *this) { @@ -2291,6 +2707,22 @@ tier_init (xlator_t *this) defrag->tier_conf.tier_demote_frequency = freq; ret = dict_get_int32 (this->options, + "tier-hot-compact-frequency", &freq); + if (ret) { + freq = DEFAULT_HOT_COMPACT_FREQ_SEC; + } + + defrag->tier_conf.tier_compact_hot_frequency = freq; + + ret = dict_get_int32 (this->options, + "tier-cold-compact-frequency", &freq); + if (ret) { + freq = DEFAULT_COLD_COMPACT_FREQ_SEC; + } + + defrag->tier_conf.tier_compact_cold_frequency = freq; + + ret = dict_get_int32 (this->options, "watermark-hi", &freq); if (ret) { freq = DEFAULT_WM_HI; @@ -2339,6 +2771,29 @@ tier_init (xlator_t *this) defrag->tier_conf.max_migrate_files = freq; ret = dict_get_str (this->options, + "tier-compact", &mode); + + if (ret) { + defrag->tier_conf.compact_active = DEFAULT_COMP_MODE; + } else { + ret = tier_validate_compact_mode (mode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "tier_init failed - invalid compaction mode"); + goto out; + } + + /* If compaction is now active, we need to inform the bricks on + the hot and cold tier of this. See dht-common.h for more. */ + defrag->tier_conf.compact_active = ret; + if (ret) { + defrag->tier_conf.compact_mode_switched_hot = _gf_true; + defrag->tier_conf.compact_mode_switched_cold = _gf_true; + } + } + + ret = dict_get_str (this->options, "tier-mode", &mode); if (ret) { defrag->tier_conf.mode = DEFAULT_TIER_MODE; @@ -2361,7 +2816,8 @@ tier_init (xlator_t *this) "tier-pause", &paused); if (paused && strcmp (paused, "on") == 0) - gf_defrag_set_pause_state (&defrag->tier_conf, TIER_REQUEST_PAUSE); + gf_defrag_set_pause_state (&defrag->tier_conf, + TIER_REQUEST_PAUSE); ret = gf_asprintf(&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, @@ -2411,7 +2867,6 @@ out: return ret; } - int tier_cli_pause_done (int op_ret, call_frame_t *sync_frame, void *data) { @@ -2445,17 +2900,17 @@ exit: return ret; } - int tier_reconfigure (xlator_t *this, dict_t *options) { - dht_conf_t *conf = NULL; - gf_defrag_info_t *defrag = NULL; - char *mode = NULL; - int migrate_mb = 0; - gf_boolean_t req_pause = _gf_false; - int ret = 0; - call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + char *mode = NULL; + int migrate_mb = 0; + gf_boolean_t req_pause = _gf_false; + int ret = 0; + call_frame_t *frame = NULL; + gf_boolean_t last_compact_setting = _gf_false; conf = this->private; @@ -2489,6 +2944,28 @@ tier_reconfigure (xlator_t *this, dict_t *options) defrag->tier_conf.watermark_low, options, int32, out); + last_compact_setting = defrag->tier_conf.compact_active; + + GF_OPTION_RECONF ("tier-compact", + defrag->tier_conf.compact_active, options, + bool, out); + + if (last_compact_setting != defrag->tier_conf.compact_active) { + defrag->tier_conf.compact_mode_switched_hot = _gf_true; + defrag->tier_conf.compact_mode_switched_cold = _gf_true; + gf_msg (this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "compact mode switched"); + } + + GF_OPTION_RECONF ("tier-hot-compact-frequency", + defrag->tier_conf.tier_compact_hot_frequency, + options, int32, out); + + GF_OPTION_RECONF ("tier-cold-compact-frequency", + defrag->tier_conf.tier_compact_cold_frequency, + options, int32, out); + GF_OPTION_RECONF ("tier-mode", mode, options, str, out); @@ -2558,7 +3035,6 @@ class_methods_t class_methods = { .notify = dht_notify }; - struct xlator_fops fops = { .lookup = dht_lookup, @@ -2611,9 +3087,7 @@ struct xlator_fops fops = { .zerofill = dht_zerofill, }; - struct xlator_cbks cbks = { .release = dht_release, .forget = dht_forget }; - diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h index 0807608fda2..ffb04173bd5 100644 --- a/xlators/cluster/dht/src/tier.h +++ b/xlators/cluster/dht/src/tier.h @@ -54,6 +54,7 @@ typedef struct _query_cbk_args { /* This is write */ int query_fd; int is_promotion; + int is_compaction; /* This is for read */ tier_qfile_array_t *qfile_array; } query_cbk_args_t; @@ -82,6 +83,8 @@ typedef struct _dm_thread_args { int freq_time; int return_value; int is_promotion; + int is_compaction; + gf_boolean_t is_hot_tier; } migration_args_t; typedef enum tier_watermark_op_ { @@ -93,12 +96,15 @@ typedef enum tier_watermark_op_ { #define DEFAULT_PROMOTE_FREQ_SEC 120 #define DEFAULT_DEMOTE_FREQ_SEC 120 +#define DEFAULT_HOT_COMPACT_FREQ_SEC 604800 +#define DEFAULT_COLD_COMPACT_FREQ_SEC 604800 #define DEFAULT_DEMOTE_DEGRADED 10 #define DEFAULT_WRITE_FREQ_SEC 0 #define DEFAULT_READ_FREQ_SEC 0 #define DEFAULT_WM_LOW 75 #define DEFAULT_WM_HI 90 #define DEFAULT_TIER_MODE TIER_MODE_TEST +#define DEFAULT_COMP_MODE _gf_true #define DEFAULT_TIER_MAX_MIGRATE_MB 1000 #define DEFAULT_TIER_MAX_MIGRATE_FILES 5000 diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c index 5f3a074acd5..933f496028c 100644 --- a/xlators/features/changetimerecorder/src/changetimerecorder.c +++ b/xlators/features/changetimerecorder/src/changetimerecorder.c @@ -15,6 +15,8 @@ #include "ctr-messages.h" #include "syscall.h" +#include "changetimerecorder.h" + /*******************************inode forget***********************************/ int @@ -1789,6 +1791,61 @@ out: return ret; } +void * +ctr_compact_thread (void *args) +{ + int ret = -1; + void *db_conn = NULL; + + xlator_t *this = NULL; + gf_ctr_private_t *priv = NULL; + gf_boolean_t compact_active = _gf_false; + gf_boolean_t compact_mode_switched = _gf_false; + + this = (xlator_t *)args; + + GF_VALIDATE_OR_GOTO("ctr", this, out); + + priv = this->private; + + db_conn = priv->_db_conn; + compact_active = priv->compact_active; + compact_mode_switched = priv->compact_mode_switched; + + gf_msg ("ctr-compact", GF_LOG_INFO, 0, CTR_MSG_SET, + "Starting compaction"); + + ret = compact_db(db_conn, compact_active, + compact_mode_switched); + + if (ret) { + gf_msg ("ctr-compact", GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to perform the compaction"); + } + + ret = pthread_mutex_lock (&priv->compact_lock); + + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to acquire lock"); + goto out; + } + + /* We are done compaction on this brick. Set all flags to false */ + priv->compact_active = _gf_false; + priv->compact_mode_switched = _gf_false; + + ret = pthread_mutex_unlock (&priv->compact_lock); + + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to release lock"); + goto out; + } + +out: + return NULL; +} int ctr_ipc_helper (xlator_t *this, dict_t *in_dict, @@ -1802,7 +1859,8 @@ ctr_ipc_helper (xlator_t *this, dict_t *in_dict, char *db_param = NULL; char *query_file = NULL; gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL; - + int result = 0; + pthread_t compact_thread; GF_VALIDATE_OR_GOTO ("ctr", this, out); GF_VALIDATE_OR_GOTO (this->name, this->private, out); @@ -1888,12 +1946,78 @@ ctr_ipc_helper (xlator_t *this, dict_t *in_dict, SET_DB_PARAM_TO_DICT(this->name, out_dict, db_param_key, db_param, ret, error); + } /* if its an attempt to compact the database */ + else if (strncmp (ctr_ipc_ops, GFDB_IPC_CTR_SET_COMPACT_PRAGMA, + strlen (GFDB_IPC_CTR_SET_COMPACT_PRAGMA)) == 0) { + + ret = pthread_mutex_lock (&priv->compact_lock); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to acquire lock for compaction"); + goto out; + } + + if ((priv->compact_active || priv->compact_mode_switched)) { + /* Compaction in progress. LEAVE */ + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Compaction already in progress."); + pthread_mutex_unlock (&priv->compact_lock); + goto out; + } + /* At this point, we should be the only one on the brick */ + /* compacting */ + + /* Grab the arguments from the dictionary */ + ret = dict_get_int32 (in_dict, "compact_active", &result); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to get compaction type"); + goto out; + } + + if (result) { + priv->compact_active = _gf_true; + } + + ret = dict_get_int32 (in_dict, "compact_mode_switched" + , &result); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to see if compaction switched"); + goto out; + } + + if (result) { + priv->compact_mode_switched = _gf_true; + gf_msg ("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET, + "Pre-thread: Compact mode switch is true"); + } else { + gf_msg ("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET, + "Pre-thread: Compact mode switch is false"); + } + + ret = pthread_mutex_unlock (&priv->compact_lock); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to release lock for compaction"); + goto out; + } + + ret = pthread_create (&compact_thread, NULL, ctr_compact_thread, + (void *)this); + + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, + "Failed to spawn compaction thread"); + goto out; + } + + goto out; } /* default case */ else { goto out; } - ret = 0; goto out; error: @@ -2079,6 +2203,18 @@ init (xlator_t *this) priv->ctr_lookupheal_inode_timeout = CTR_DEFAULT_INODE_EXP_PERIOD; + /* For compaction */ + priv->compact_active = _gf_false; + priv->compact_mode_switched = _gf_false; + ret_db = pthread_mutex_init (&priv->compact_lock, NULL); + + if (ret_db) { + gf_msg (this->name, GF_LOG_ERROR, 0, + CTR_MSG_FATAL_ERROR, + "FATAL: Failed initializing compaction mutex"); + goto error; + } + /*Extract ctr xlator options*/ ret_db = extract_ctr_options (this, priv); if (ret_db) { @@ -2123,6 +2259,7 @@ init (xlator_t *this) goto error; } + ret_db = 0; goto out; @@ -2185,6 +2322,11 @@ fini (xlator_t *this) "db connection"); } GF_FREE (priv->ctr_db_path); + if (pthread_mutex_destroy (&priv->compact_lock)) { + gf_msg (this->name, GF_LOG_WARNING, 0, + CTR_MSG_CLOSE_DB_CONN_FAILED, "Failed to " + "destroy the compaction mutex"); + } } GF_FREE (priv); mem_pool_destroy (this->local_pool); diff --git a/xlators/features/changetimerecorder/src/ctr-helper.h b/xlators/features/changetimerecorder/src/ctr-helper.h index d5615270184..4fd4f745f4d 100644 --- a/xlators/features/changetimerecorder/src/ctr-helper.h +++ b/xlators/features/changetimerecorder/src/ctr-helper.h @@ -22,6 +22,7 @@ #include "common-utils.h" #include <time.h> #include <sys/time.h> +#include <pthread.h> #include "gfdb_data_store.h" #include "ctr-xlator-ctx.h" @@ -52,6 +53,9 @@ typedef struct gf_ctr_private { gfdb_conn_node_t *_db_conn; uint64_t ctr_lookupheal_link_timeout; uint64_t ctr_lookupheal_inode_timeout; + gf_boolean_t compact_active; + gf_boolean_t compact_mode_switched; + pthread_mutex_t compact_lock; } gf_ctr_private_t; diff --git a/xlators/mgmt/glusterd/src/glusterd-messages.h b/xlators/mgmt/glusterd/src/glusterd-messages.h index e520c69add2..2ba1876b6ec 100644 --- a/xlators/mgmt/glusterd/src/glusterd-messages.h +++ b/xlators/mgmt/glusterd/src/glusterd-messages.h @@ -4755,4 +4755,3 @@ /*------------*/ #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" #endif /* !_GLUSTERD_MESSAGES_H_ */ - diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index ce34ffd2b05..d87082e9e89 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -387,6 +387,14 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key, goto out; } goto out; + } else if (strstr (key, "tier-compact")) { + if (strcmp (value, "on") && + strcmp (value, "off")) { + ret = -1; + goto out; + } + + goto out; } /* @@ -452,7 +460,9 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key, strstr (key, "tier-max-mb") || strstr (key, "tier-max-promote-file-size") || strstr (key, "tier-max-files") || - strstr (key, "tier-demote-frequency")) { + strstr (key, "tier-demote-frequency") || + strstr (key, "tier-hot-compact-frequency") || + strstr (key, "tier-cold-compact-frequency")) { if (origin_val < 1) { snprintf (errstr, sizeof (errstr), "%s is not a " " compatible value. %s expects a positive " @@ -464,7 +474,6 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key, ret = -1; goto out; } - } out: gf_msg_debug (this->name, 0, "Returning %d", ret); @@ -1589,17 +1598,17 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_CLIENT_OPT }, - /* Crypt xlator options */ + /* Crypt xlator options */ - { .key = "features.encryption", - .voltype = "encryption/crypt", - .option = "!feat", - .value = "off", - .op_version = 3, - .description = "enable/disable client-side encryption for " + { .key = "features.encryption", + .voltype = "encryption/crypt", + .option = "!feat", + .value = "off", + .op_version = 3, + .description = "enable/disable client-side encryption for " "the volume.", - .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT - }, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT + }, { .key = "encryption.master-key", .voltype = "encryption/crypt", @@ -1968,7 +1977,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_CLIENT_OPT }, - /* Feature translators */ + /* Feature translators */ { .key = "features.uss", .voltype = "features/snapview-server", .op_version = GD_OP_VERSION_3_6_0, @@ -2730,6 +2739,32 @@ struct volopt_map_entry glusterd_volopt_map[] = { .description = "The maximum number of files that may be migrated" " in any direction in a given cycle by a single node." }, + { .key = "cluster.tier-compact", + .voltype = "cluster/tier", + .option = "tier-compact", + .value = "on", + .op_version = GD_OP_VERSION_3_9_0, + .flags = OPT_FLAG_CLIENT_OPT, + .validate_fn = validate_tier, + .description = "Activate or deactivate the compaction of the DB" + " for the volume's metadata." + }, + { .key = "cluster.tier-hot-compact-frequency", + .voltype = "cluster/tier", + .value = "604800", + .option = "tier-hot-compact-frequency", + .op_version = GD_OP_VERSION_3_9_0, + .flags = OPT_FLAG_CLIENT_OPT, + .validate_fn = validate_tier, + }, + { .key = "cluster.tier-cold-compact-frequency", + .voltype = "cluster/tier", + .value = "604800", + .option = "tier-cold-compact-frequency", + .op_version = GD_OP_VERSION_3_9_0, + .flags = OPT_FLAG_CLIENT_OPT, + .validate_fn = validate_tier, + }, { .key = "features.ctr-enabled", .voltype = "features/changetimerecorder", .value = "off", |