author     Venky Shankar <vshankar@redhat.com>      2015-04-27 21:34:34 +0530
committer  Vijay Bellur <vbellur@redhat.com>        2015-05-07 22:51:41 -0700
commit     9ba8963999bca431ec14a25961a163810cfe1e5b (patch)
tree       783f5a29b7cfc63331a88a1ec5d222a7a4c2d57e
parent     4ccd70b323d4cb929b7b7a88e592fc98fab06198 (diff)
features/bitrot: Throttle filesystem scrubber
This patch introduces a multithreaded filesystem scrubber based
on the throttling option configured for a particular volume. The
implementation "logically" separates scanning from scrubbing, with
the number of scrubber threads auto-configured according to the
throttle configuration. Scanning (crawling) remains single threaded
(per brick), with entries scrubbed in bulk: on reaching this "bulk"
watermark, the scanner waits until the queued entries are scrubbed.
The bricks of a volume share a set of scrubber thread(s), with
entries from each brick scrubbed in round-robin fashion so that
scrubbing does not "stall" while one brick (out of N bricks) is
under active scrubbing.
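
The scanner/scrubber handoff described above can be summarized in a
minimal sketch using generic pthread primitives. The names below
(struct handoff, queue_entry, entry_done, WATERMARK) are illustrative
only; the real implementation, with its queued/ready lists, is in
bit-rot-scrub.c in the diff that follows.

    /* Minimal sketch (not the patch's actual structures): one scanner
     * thread queues entries, blocks at the bulk watermark, and is woken
     * once the scrubber threads have drained the batch. */
    #include <pthread.h>

    #define WATERMARK 128                    /* NR_ENTRIES in the patch */

    struct handoff {
            pthread_mutex_t lock;
            pthread_cond_t  drained;         /* scanner waits here */
            pthread_cond_t  available;       /* scrubbers wait here */
            unsigned int    entries;         /* queued, not yet scrubbed */
    };

    /* scanner side: invoked for every crawled entry */
    static void
    queue_entry (struct handoff *h)
    {
            pthread_mutex_lock (&h->lock);
            h->entries++;
            if (h->entries >= WATERMARK) {
                    pthread_cond_broadcast (&h->available); /* wake scrubbers */
                    while (h->entries != 0)                 /* block till drained */
                            pthread_cond_wait (&h->drained, &h->lock);
            }
            pthread_mutex_unlock (&h->lock);
    }

    /* scrubber side: invoked after verifying one object */
    static void
    entry_done (struct handoff *h)
    {
            pthread_mutex_lock (&h->lock);
            if (--h->entries == 0)
                    pthread_cond_signal (&h->drained);      /* unblock scanner */
            pthread_mutex_unlock (&h->lock);
    }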
This mechanism makes it easy to implement "pause/resume": all one
needs to do is clean up the scrubber threads and let the main
scanner thread "wait" until scrubbing is resumed (at which point
the scrubber thread(s) are spawned again), thereby continuing
where we left off (unless the daemons are restarted, in which case
the crawl starts from the root directory again, but that's OK).
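
Since pausing works by cancelling the scrubber threads, a scrubber
must be able to die while blocked on a condition variable without
leaving the shared mutex locked. The patch handles this with pthread
cleanup handlers (see br_scrubber_pick_entry() in the diff below);
the sketch here shows the pattern with illustrative names
(unlock_on_cancel, pick_entry, have_work).

    #include <pthread.h>

    /* cleanup handler: runs if the thread is cancelled while the
     * mutex is held (pthread_cond_wait re-acquires it before dying) */
    static void
    unlock_on_cancel (void *arg)
    {
            pthread_mutex_unlock ((pthread_mutex_t *) arg);
    }

    static void
    pick_entry (pthread_mutex_t *mutex, pthread_cond_t *cond, int *have_work)
    {
            pthread_cleanup_push (unlock_on_cancel, mutex);

            pthread_mutex_lock (mutex);
            while (!*have_work)
                    pthread_cond_wait (cond, mutex);  /* cancellation point */
            pthread_mutex_unlock (mutex);

            pthread_cleanup_pop (0);  /* 0: don't run handler on normal exit */
    }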
[
NOTE:
Throttling is optional for the signer daemon; without it, the daemon
runs at full throttle. However, passing "-DBR_RATE_LIMIT_SIGNER"
in CFLAGS enables CPU throttling (during checksum calculation),
thereby avoiding high CPU usage.
]
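
In effect, the flag selects at build time between a zero token rate
(throttling disabled) and a bounded one. A sketch of that gate,
omitting the per-brick "contribution" scaling which the actual
br_rate_limit_signer() in the diff below applies:

    br_tbf_opspec_t spec = {0, };   /* zero rate => no throttling */

    spec.op = BR_TBF_OP_HASH;
    #ifdef BR_RATE_LIMIT_SIGNER
    spec.rate     = BR_HASH_CALC_READ_SIZE;              /* bytes/sec hashed */
    spec.maxlimit = BR_WORKERS * BR_HASH_CALC_READ_SIZE; /* bucket ceiling */
    #endif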
Subsequent patches will introduce CPU throttling during hash
calculation for the scrubber.
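
The underlying primitive in both cases is a token bucket (br_tbf).
The sketch below shows the general idea in plain C, not the br_tbf
API (struct tbf and tbf_throttle are illustrative): a read of `need`
bytes for checksumming proceeds only after acquiring that many
tokens, which refill at a fixed rate up to a ceiling, capping
sustained CPU usage.

    #include <time.h>

    struct tbf {
            double          tokens;    /* currently available */
            double          rate;      /* refill per second (0 = disabled) */
            double          maxlimit;  /* bucket capacity */
            struct timespec last;      /* last refill; caller initializes */
    };

    static void
    tbf_throttle (struct tbf *b, double need)
    {
            struct timespec now, zzz = {0, 10 * 1000 * 1000}; /* 10ms */

            if (b->rate == 0)          /* unconfigured: full throttle */
                    return;

            for (;;) {
                    /* credit tokens for the time elapsed since last refill */
                    clock_gettime (CLOCK_MONOTONIC, &now);
                    b->tokens += b->rate * ((now.tv_sec - b->last.tv_sec) +
                                            (now.tv_nsec - b->last.tv_nsec) / 1e9);
                    if (b->tokens > b->maxlimit)
                            b->tokens = b->maxlimit;
                    b->last = now;

                    if (b->tokens >= need) {
                            b->tokens -= need;      /* proceed with the read */
                            return;
                    }
                    nanosleep (&zzz, NULL);         /* wait for tokens to accrue */
            }
    }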
Change-Id: I5701dd6cd4dff27ca3144ac5e3798a2216b39d4f
BUG: 1207020
Signed-off-by: Venky Shankar <vshankar@redhat.com>
Reviewed-on: http://review.gluster.org/10511
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--  libglusterfs/src/gf-dirent.c                                |  39
-rw-r--r--  libglusterfs/src/gf-dirent.h                                |   2
-rw-r--r--  libglusterfs/src/list.h                                     |  14
-rw-r--r--  libglusterfs/src/syncop.c                                   |  17
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot-scrub.c           | 565
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot-scrub.h           |   9
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot.c                 | 146
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot.h                 |  40
-rw-r--r--  xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h  |   3
9 files changed, 760 insertions(+), 75 deletions(-)
diff --git a/libglusterfs/src/gf-dirent.c b/libglusterfs/src/gf-dirent.c
index b5f395afc36..99c0eb6441d 100644
--- a/libglusterfs/src/gf-dirent.c
+++ b/libglusterfs/src/gf-dirent.c
@@ -171,6 +171,20 @@ gf_dirent_for_name (const char *name)
         return gf_dirent;
 }
 
+void
+gf_dirent_entry_free (gf_dirent_t *entry)
+{
+        if (!entry)
+                return;
+
+        if (entry->dict)
+                dict_unref (entry->dict);
+        if (entry->inode)
+                inode_unref (entry->inode);
+
+        list_del (&entry->list);
+        GF_FREE (entry);
+}
 
 void
 gf_dirent_free (gf_dirent_t *entries)
@@ -185,16 +199,27 @@ gf_dirent_free (gf_dirent_t *entries)
                 return;
 
         list_for_each_entry_safe (entry, tmp, &entries->list, list) {
-                if (entry->dict)
-                        dict_unref (entry->dict);
-                if (entry->inode)
-                        inode_unref (entry->inode);
-
-                list_del (&entry->list);
-                GF_FREE (entry);
+                gf_dirent_entry_free (entry);
         }
 }
 
+gf_dirent_t *
+entry_copy (gf_dirent_t *source)
+{
+        gf_dirent_t *sink = NULL;
+
+        sink = gf_dirent_for_name (source->d_name);
+
+        sink->d_off = source->d_off;
+        sink->d_ino = source->d_ino;
+        sink->d_type = source->d_type;
+        sink->d_stat = source->d_stat;
+
+        if (source->inode)
+                sink->inode = inode_ref (source->inode);
+        return sink;
+}
+
 void
 gf_link_inode_from_dirent (xlator_t *this, inode_t *parent, gf_dirent_t *entry)
 {
diff --git a/libglusterfs/src/gf-dirent.h b/libglusterfs/src/gf-dirent.h
index 07c605f82b0..faeaf411941 100644
--- a/libglusterfs/src/gf-dirent.h
+++ b/libglusterfs/src/gf-dirent.h
@@ -61,6 +61,8 @@ struct _gf_dirent_t {
 #define DT_ISDIR(mode) (mode == DT_DIR)
 
 gf_dirent_t *gf_dirent_for_name (const char *name);
+gf_dirent_t *entry_copy (gf_dirent_t *source);
+void gf_dirent_entry_free (gf_dirent_t *entry);
 void gf_dirent_free (gf_dirent_t *entries);
 int gf_link_inodes_from_dirent (xlator_t *this, inode_t *parent,
                                 gf_dirent_t *entries);
diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h
index 875594136a2..b8f9a6eebd8 100644
--- a/libglusterfs/src/list.h
+++ b/libglusterfs/src/list.h
@@ -214,6 +214,20 @@ static inline void list_replace_init(struct list_head *old,
         INIT_LIST_HEAD(old);
 }
 
+/**
+ * list_rotate_left - rotate the list to the left
+ * @head: the head of the list
+ */
+static inline void list_rotate_left (struct list_head *head)
+{
+        struct list_head *first;
+
+        if (!list_empty (head)) {
+                first = head->next;
+                list_move_tail (first, head);
+        }
+}
+
 #define list_entry(ptr, type, member)                                   \
         ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index cb08b03d44b..81eae5a9105 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -1217,23 +1217,6 @@ syncop_lookup (xlator_t *subvol, loc_t *loc, struct iatt *iatt,
         return args.op_ret;
 }
 
-static gf_dirent_t *
-entry_copy (gf_dirent_t *source)
-{
-        gf_dirent_t *sink = NULL;
-
-        sink = gf_dirent_for_name (source->d_name);
-
-        sink->d_off = source->d_off;
-        sink->d_ino = source->d_ino;
-        sink->d_type = source->d_type;
-        sink->d_stat = source->d_stat;
-
-        if (source->inode)
-                sink->inode = inode_ref (source->inode);
-        return sink;
-}
-
 int32_t
 syncop_readdirp_cbk (call_frame_t *frame,
                      void *cookie,
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
index e0581a40df0..8a80052f250 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
@@ -13,17 +13,35 @@
 #include "config.h"
 #endif
 
+#include <math.h>
 #include <ctype.h>
 #include <sys/uio.h>
 
 #include "glusterfs.h"
-#include "xlator.h"
 #include "logging.h"
+#include "common-utils.h"
 
-#include "bit-rot.h"
 #include "bit-rot-scrub.h"
 #include <pthread.h>
 
+struct br_scrubbers {
+        pthread_t scrubthread;
+
+        struct list_head list;
+};
+
+struct br_fsscan_entry {
+        void *data;
+
+        loc_t parent;
+
+        gf_dirent_t *entry;
+
+        struct br_scanfs *fsscan;  /* backpointer to subvolume scanner */
+
+        struct list_head list;
+};
+
 /**
  * fetch signature extended attribute from an object's fd.
  * NOTE: On success @xattr is not unref'd as @sign points
@@ -246,8 +264,7 @@ bitd_compare_ckum (xlator_t *this,
  * signs with SHA256).
  */
 int
-bitd_start_scrub (xlator_t *subvol,
-                  gf_dirent_t *entry, loc_t *parent, void *data)
+br_scrubber_scrub_begin (xlator_t *this, struct br_fsscan_entry *fsentry)
 {
         int32_t              ret           = -1;
         fd_t                *fd            = NULL;
@@ -256,17 +273,22 @@ bitd_start_scrub (xlator_t *subvol,
         struct iatt          parent_buf    = {0, };
         pid_t                pid           = 0;
         br_child_t          *child         = NULL;
-        xlator_t            *this          = NULL;
         unsigned char       *md            = NULL;
         inode_t             *linked_inode  = NULL;
         br_isignature_out_t *sign          = NULL;
         unsigned long        signedversion = 0;
+        gf_dirent_t         *entry         = NULL;
+        loc_t               *parent        = NULL;
 
-        GF_VALIDATE_OR_GOTO ("bit-rot", subvol, out);
-        GF_VALIDATE_OR_GOTO ("bit-rot", data, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", fsentry, out);
 
-        child = data;
-        this = child->this;
+        entry = fsentry->entry;
+        parent = &fsentry->parent;
+        child = fsentry->data;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", entry, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", parent, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", child, out);
 
         pid = GF_CLIENT_PID_SCRUB;
 
@@ -366,29 +388,532 @@ bitd_start_scrub (xlator_t *subvol,
         return ret;
 }
 
-#define BR_SCRUB_THROTTLE_COUNT 30
-#define BR_SCRUB_THROTTLE_ZZZ   60
+static void
+wait_for_scrubbing (xlator_t *this, struct br_scanfs *fsscan)
+{
+        br_private_t *priv = NULL;
+        struct br_scrubber *fsscrub = NULL;
+
+        priv = this->private;
+        fsscrub = &priv->fsscrub;
+
+        pthread_mutex_lock (&fsscan->waitlock);
+        {
+                pthread_mutex_lock (&fsscrub->mutex);
+                {
+                        list_replace_init (&fsscan->queued, &fsscan->ready);
+
+                        /* wake up scrubbers */
+                        pthread_cond_broadcast (&fsscrub->cond);
+                }
+                pthread_mutex_unlock (&fsscrub->mutex);
+
+                while (fsscan->entries != 0)
+                        pthread_cond_wait
+                                   (&fsscan->waitcond, &fsscan->waitlock);
+        }
+        pthread_mutex_unlock (&fsscan->waitlock);
+}
+
+static inline void
+_br_fsscan_inc_entry_count (struct br_scanfs *fsscan)
+{
+        fsscan->entries++;
+}
+
+static inline void
+_br_fsscan_dec_entry_count (struct br_scanfs *fsscan)
+{
+        if (--fsscan->entries == 0) {
+                pthread_mutex_lock (&fsscan->waitlock);
+                {
+                        pthread_cond_signal (&fsscan->waitcond);
+                }
+                pthread_mutex_unlock (&fsscan->waitlock);
+        }
+}
+
+static void
+_br_fsscan_collect_entry (struct br_scanfs *fsscan,
+                          struct br_fsscan_entry *fsentry)
+{
+        list_add_tail (&fsentry->list, &fsscan->queued);
+        _br_fsscan_inc_entry_count (fsscan);
+}
+
+#define NR_ENTRIES (1<<7) /* ..bulk scrubbing */
+
+int
+br_fsscanner_handle_entry (xlator_t *subvol,
+                           gf_dirent_t *entry, loc_t *parent, void *data)
+{
+        int32_t ret = -1;
+        int scrub = 0;
+        br_child_t *child = NULL;
+        xlator_t *this = NULL;
+        struct br_scanfs *fsscan = NULL;
+        struct br_fsscan_entry *fsentry = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", subvol, error_return);
+        GF_VALIDATE_OR_GOTO ("bit-rot", data, error_return);
+
+        child = data;
+        this = child->this;
+        fsscan = &child->fsscan;
+
+        fsentry = GF_CALLOC (1, sizeof (*fsentry), gf_br_mt_br_fsscan_entry_t);
+        if (!fsentry)
+                goto error_return;
+
+        {
+                fsentry->data = data;
+                fsentry->fsscan = &child->fsscan;
+
+                /* copy parent loc */
+                ret = loc_copy (&fsentry->parent, parent);
+                if (ret)
+                        goto dealloc;
+
+                /* copy child entry */
+                fsentry->entry = entry_copy (entry);
+                if (!fsentry->entry)
+                        goto locwipe;
+
+                INIT_LIST_HEAD (&fsentry->list);
+        }
+
+        LOCK (&fsscan->entrylock);
+        {
+                _br_fsscan_collect_entry (fsscan, fsentry);
+
+                /**
+                 * need not be a equality check as entries may be pushed
+                 * back onto the scanned queue when thread(s) are cleaned.
+                 */
+                if (fsscan->entries >= NR_ENTRIES)
+                        scrub = 1;
+        }
+        UNLOCK (&fsscan->entrylock);
+
+        if (scrub)
+                wait_for_scrubbing (this, fsscan);
+
+        return 0;
+
+ locwipe:
+        loc_wipe (&fsentry->parent);
+ dealloc:
+        GF_FREE (fsentry);
+ error_return:
+        return -1;
+}
+
 void *
-br_scrubber (void *arg)
+br_fsscanner (void *arg)
 {
-        loc_t       loc   = {0,};
-        xlator_t   *this  = NULL;
-        br_child_t *child = NULL;
+        loc_t loc = {0,};
+        xlator_t *this = NULL;
+        br_child_t *child = NULL;
+        struct br_scanfs *fsscan = NULL;
 
         child = arg;
         this = child->this;
+        fsscan = &child->fsscan;
         THIS = this;
 
         loc.inode = child->table->root;
         while (1) {
-                (void) syncop_ftw_throttle
-                        (child->xl, &loc,
-                         GF_CLIENT_PID_SCRUB, child, bitd_start_scrub,
-                         BR_SCRUB_THROTTLE_COUNT, BR_SCRUB_THROTTLE_ZZZ);
+                (void) syncop_ftw (child->xl, &loc,
+                                   GF_CLIENT_PID_SCRUB,
+                                   child, br_fsscanner_handle_entry);
+                if (!list_empty (&fsscan->queued))
+                        wait_for_scrubbing (this, fsscan);
+        }
+
+        return NULL;
+}
+
+#define BR_SCRUB_THREAD_SCALE_LAZY       0
+#define BR_SCRUB_THREAD_SCALE_NORMAL     0.4
+#define BR_SCRUB_THREAD_SCALE_AGGRESSIVE 1.0
+
+#ifndef M_E
+#define M_E 2.718
+#endif
+
+/**
+ * This is just a simple exponential scale to a fixed value selected
+ * per throttle config. We probably need to be more smart and select
+ * the scale based on the number of processor cores too.
+ */
+static unsigned int
+br_scrubber_calc_scale (xlator_t *this,
+                        br_private_t *priv, scrub_throttle_t throttle)
+{
+        unsigned int scale = 0;
+
+        switch (throttle) {
+        case BR_SCRUB_THROTTLE_VOID:
+                scale = 0;
+                break;
+        case BR_SCRUB_THROTTLE_LAZY:
+                scale = priv->child_count *
+                                pow (M_E, BR_SCRUB_THREAD_SCALE_LAZY);
+                break;
+        case BR_SCRUB_THROTTLE_NORMAL:
+                scale = priv->child_count *
+                                pow (M_E, BR_SCRUB_THREAD_SCALE_NORMAL);
+                break;
+        case BR_SCRUB_THROTTLE_AGGRESSIVE:
+                scale = priv->child_count *
+                                pow (M_E, BR_SCRUB_THREAD_SCALE_AGGRESSIVE);
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unknown throttle %d", throttle);
+        }
+
+        return scale;
+
+}
+
+static void
+br_scrubber_cleanup_handler (void *arg)
+{
+        struct br_scrubber *fsscrub = arg;
+        pthread_mutex_unlock (&fsscrub->mutex);
+}
+
+static inline br_child_t *
+_br_scrubber_get_next_child (struct br_scrubber *fsscrub)
+{
+        br_child_t *child = NULL;
+
+        child = list_first_entry (&fsscrub->scrublist, br_child_t, list);
+        list_rotate_left (&fsscrub->scrublist);
+
+        return child;
+}
+
+static inline void
+_br_scrubber_get_entry (br_child_t *child, struct br_fsscan_entry **fsentry)
+{
+        struct br_scanfs *fsscan = &child->fsscan;
+
+        if (list_empty (&fsscan->ready))
+                return;
+        *fsentry = list_first_entry
+                        (&fsscan->ready, struct br_fsscan_entry, list);
+        list_del_init (&(*fsentry)->list);
+}
+
+static inline void
+_br_scrubber_find_scrubbable_entry (struct br_scrubber *fsscrub,
+                                    struct br_fsscan_entry **fsentry)
+{
+        br_child_t *child = NULL;
+        br_child_t *firstchild = NULL;
+
+        while (1) {
+                if (list_empty (&fsscrub->scrublist))
+                        pthread_cond_wait (&fsscrub->cond, &fsscrub->mutex);
+
+                firstchild = NULL;
+                for (child = _br_scrubber_get_next_child (fsscrub);
+                     child != firstchild;
+                     child = _br_scrubber_get_next_child (fsscrub)) {
+
+                        if (!firstchild)
+                                firstchild = child;
+
+                        _br_scrubber_get_entry (child, fsentry);
+                        if (*fsentry)
+                                break;
+                }
+
+                if (*fsentry)
+                        break;
+
+                /* nothing to work on.. wait till available */
+                pthread_cond_wait (&fsscrub->cond, &fsscrub->mutex);
+        }
+}
+
+static void
+br_scrubber_pick_entry (struct br_scrubber *fsscrub,
+                        struct br_fsscan_entry **fsentry)
+{
+        pthread_cleanup_push (br_scrubber_cleanup_handler, fsscrub);
+
+        pthread_mutex_lock (&fsscrub->mutex);
+        {
+                *fsentry = NULL;
+                _br_scrubber_find_scrubbable_entry (fsscrub, fsentry);
+        }
+        pthread_mutex_unlock (&fsscrub->mutex);
+
+        pthread_cleanup_pop (0);
+}
 
-                sleep (BR_SCRUB_THROTTLE_ZZZ);
+struct br_scrub_entry {
+        gf_boolean_t scrubbed;
+        struct br_fsscan_entry *fsentry;
+};
+
+/**
+ * We need to be a bit careful here. These thread(s) are prone to cancellations
+ * when threads are scaled down (depending on the thottling value configured)
+ * and pausing scrub. A thread can get cancelled while it's waiting for entries
+ * in the ->pending queue or when an object is undergoing scrubbing.
+ */
+static void
+br_scrubber_entry_handle (void *arg)
+{
+        struct br_scanfs *fsscan = NULL;
+        struct br_scrub_entry *sentry = NULL;
+        struct br_fsscan_entry *fsentry = NULL;
+
+        sentry = arg;
+
+        fsentry = sentry->fsentry;
+        fsscan = fsentry->fsscan;
+
+        LOCK (&fsscan->entrylock);
+        {
+                if (sentry->scrubbed) {
+                        _br_fsscan_dec_entry_count (fsscan);
+
+                        /* cleanup ->entry */
+                        fsentry->data = NULL;
+                        fsentry->fsscan = NULL;
+                        loc_wipe (&fsentry->parent);
+                        gf_dirent_entry_free (fsentry->entry);
+
+                        GF_FREE (sentry->fsentry);
+                } else {
+                        /* (re)queue the entry again for scrub */
+                        _br_fsscan_collect_entry (fsscan, sentry->fsentry);
+                }
+        }
+        UNLOCK (&fsscan->entrylock);
+}
+
+static void
+br_scrubber_scrub_entry (xlator_t *this, struct br_fsscan_entry *fsentry)
+{
+        struct br_scrub_entry sentry = {0, };
+
+        sentry.scrubbed = 0;
+        sentry.fsentry = fsentry;
+
+        pthread_cleanup_push (br_scrubber_entry_handle, &sentry);
+        {
+                (void) br_scrubber_scrub_begin (this, fsentry);
+                sentry.scrubbed = 1;
+        }
+        pthread_cleanup_pop (1);
+}
+
+void *br_scrubber_proc (void *arg)
+{
+        xlator_t *this = NULL;
+        struct br_scrubber *fsscrub = NULL;
+        struct br_fsscan_entry *fsentry = NULL;
+
+        fsscrub = arg;
+        THIS = this = fsscrub->this;
+
+        while (1) {
+                br_scrubber_pick_entry (fsscrub, &fsentry);
+                br_scrubber_scrub_entry (this, fsentry);
+                sleep (1);
         }
 
         return NULL;
 }
+
+static int32_t
+br_scrubber_scale_up (xlator_t *this,
+                      struct br_scrubber *fsscrub,
+                      unsigned int v1, unsigned int v2)
+{
+        int i = 0;
+        int32_t ret = -1;
+        int diff = 0;
+        struct br_scrubbers *scrub = NULL;
+
+        diff = (int)(v2 - v1);
+
+        gf_log (this->name, GF_LOG_INFO,
+                "Scaling up scrubbers [%d => %d]", v1, v2);
+
+        for (i = 0; i < diff; i++) {
+                scrub = GF_CALLOC (diff, sizeof (*scrub),
+                                   gf_br_mt_br_scrubber_t);
+                if (!scrub)
+                        break;
+
+                INIT_LIST_HEAD (&scrub->list);
+                ret = gf_thread_create (&scrub->scrubthread,
+                                        NULL, br_scrubber_proc, fsscrub);
+                if (ret)
+                        break;
+
+                fsscrub->nr_scrubbers++;
+                list_add_tail (&scrub->list, &fsscrub->scrubbers);
+        }
+
+        if ((i != diff) && !scrub)
+                goto error_return;
+
+        if (i != diff) /* degraded scaling.. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Could not fully scale up to %d scrubber(s). Spawned "
+                        "%d/%d [total scrubber(s): %d]", v2, i, diff, (v1 + i));
+
+        return 0;
+
+ error_return:
+        return -1;
+}
+
+static int32_t
+br_scrubber_scale_down (xlator_t *this,
+                        struct br_scrubber *fsscrub,
+                        unsigned int v1, unsigned int v2)
+{
+        int i = 0;
+        int diff = 0;
+        int32_t ret = -1;
+        struct br_scrubbers *scrub = NULL;
+
+        diff = (int)(v1 - v2);
+
+        gf_log (this->name, GF_LOG_INFO,
+                "Scaling down scrubbers [%d => %d]", v1, v2);
+
+        for (i = 0 ; i < diff; i++) {
+                scrub = list_first_entry
+                            (&fsscrub->scrubbers, struct br_scrubbers, list);
+
+                list_del_init (&scrub->list);
+                ret = gf_thread_cleanup_xint (scrub->scrubthread);
+                if (ret)
+                        break;
+                GF_FREE (scrub);
+
+                fsscrub->nr_scrubbers--;
+        }
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Could not fully scale down to %d scrubber(s). "
+                        "Terminated %d/%d [total scrubber(s): %d]",
+                        v1, i, diff, (v2 - i));
+                ret = 0;
+        }
+
+        return ret;
+}
+
+static int32_t
+br_scrubber_configure (xlator_t *this, br_private_t *priv,
+                       struct br_scrubber *fsscrub, scrub_throttle_t nthrottle)
+{
+        int32_t ret = 0;
+        unsigned int v1 = 0;
+        unsigned int v2 = 0;
+
+        v1 = fsscrub->nr_scrubbers;
+        v2 = br_scrubber_calc_scale (this, priv, nthrottle);
+
+        if (v1 == v2)
+                return 0;
+
+        if (v1 > v2)
+                ret = br_scrubber_scale_down (this, fsscrub, v1, v2);
+        else
+                ret = br_scrubber_scale_up (this, fsscrub, v1, v2);
+
+        return ret;
+}
+
+/* TODO: token buket spec */
+static int32_t
+br_scrubber_handle_throttle (xlator_t *this,
+                             br_private_t *priv, dict_t *options)
+{
+        int32_t ret = 0;
+        char *tmp = NULL;
+        struct br_scrubber *fsscrub = NULL;
+        scrub_throttle_t nthrottle = BR_SCRUB_THROTTLE_VOID;
+
+        fsscrub = &priv->fsscrub;
+
+        if (options)
+                GF_OPTION_RECONF ("scrub-throttle",
+                                  tmp, options, str, error_return);
+        else
+                GF_OPTION_INIT ("scrub-throttle", tmp, str, error_return);
+
+        if (strcasecmp (tmp, "lazy") == 0)
+                nthrottle = BR_SCRUB_THROTTLE_LAZY;
+        else if (strcasecmp (tmp, "normal") == 0)
+                nthrottle = BR_SCRUB_THROTTLE_NORMAL;
+        else if (strcasecmp (tmp, "aggressive") == 0)
+                nthrottle = BR_SCRUB_THROTTLE_AGGRESSIVE;
+        else
+                goto error_return;
+
+        /* on failure old throttling value is preserved */
+        ret = br_scrubber_configure (this, priv, fsscrub, nthrottle);
+        if (ret)
+                goto error_return;
+
+        fsscrub->throttle = nthrottle;
+        return 0;
+
+ error_return:
+        return -1;
+}
+
+/* TODO: pause/resume, frequency */
+int32_t
+br_scrubber_handle_options (xlator_t *this, br_private_t *priv, dict_t *options)
+{
+        int32_t ret = 0;
+
+        ret = br_scrubber_handle_throttle (this, priv, options);
+        if (ret)
+                goto error_return;
+
+        return 0;
+
+ error_return:
+        return -1;
+}
+
+int32_t
+br_scrubber_init (xlator_t *this, br_private_t *priv)
+{
+        struct br_scrubber *fsscrub = NULL;
+
+        priv->tbf = br_tbf_init (NULL, 0);
+        if (!priv->tbf)
+                return -1;
+
+        fsscrub = &priv->fsscrub;
+
+        fsscrub->this = this;
+        fsscrub->throttle = BR_SCRUB_THROTTLE_VOID;
+
+        pthread_mutex_init (&fsscrub->mutex, NULL);
+        pthread_cond_init (&fsscrub->cond, NULL);
+
+        fsscrub->nr_scrubbers = 0;
+        INIT_LIST_HEAD (&fsscrub->scrubbers);
+        INIT_LIST_HEAD (&fsscrub->scrublist);
+
+        return 0;
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
index daec9ad8196..4f00020d66a 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
@@ -11,6 +11,13 @@
 #ifndef __BIT_ROT__SCRUB_H__
 #define __BIT_ROT_SCRUB_H__
 
-void *br_scrubber (void *);
+#include "xlator.h"
+#include "bit-rot.h"
+
+void *br_fsscanner (void *);
+
+int32_t br_scrubber_handle_options (xlator_t *, br_private_t *, dict_t *);
+
+int32_t br_scrubber_init (xlator_t *, br_private_t *);
 
 #endif /* __BIT_ROT_SCRUB_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index 880b16edfa8..eea81aec53a 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -29,15 +29,6 @@
 
 #define BR_HASH_CALC_READ_SIZE  (128 * 1024)
 
-br_tbf_opspec_t opthrottle[] = {
-        {
-                .op       = BR_TBF_OP_HASH,
-                .rate     = BR_HASH_CALC_READ_SIZE,
-                .maxlimit = (2 * BR_WORKERS * BR_HASH_CALC_READ_SIZE),
-        },
-        /** TODO: throttle getdents(), read() request(s) */
-};
-
 static int
 br_find_child_index (xlator_t *this, xlator_t *child)
 {
@@ -1066,6 +1057,7 @@ br_enact_signer (xlator_t *this, br_child_t *child, br_stub_init_t *stub)
         child->threadrunning = 1;
 
         /* it's OK to continue, "old" objects would be signed when modified */
+        list_del_init (&child->list);
         return 0;
 
  dealloc:
@@ -1078,14 +1070,45 @@ static inline int32_t
 br_enact_scrubber (xlator_t *this, br_child_t *child)
 {
         int32_t ret = 0;
+        br_private_t *priv = NULL;
+        struct br_scanfs *fsscan = NULL;
+        struct br_scrubber *fsscrub = NULL;
+
+        priv = this->private;
+
+        fsscan = &child->fsscan;
+        fsscrub = &priv->fsscrub;
+
+        LOCK_INIT (&fsscan->entrylock);
+        pthread_mutex_init (&fsscan->waitlock, NULL);
+        pthread_cond_init (&fsscan->waitcond, NULL);
 
-        ret = gf_thread_create (&child->thread, NULL, br_scrubber, child);
+        fsscan->entries = 0;
+        INIT_LIST_HEAD (&fsscan->queued);
+        INIT_LIST_HEAD (&fsscan->ready);
+
+        ret = gf_thread_create (&child->thread, NULL, br_fsscanner, child);
         if (ret != 0) {
-                ret = -1;
-                gf_log (this->name, GF_LOG_ERROR, "failed to spawn scrubber");
+                gf_log (this->name, GF_LOG_ALERT, "failed to spawn bitrot "
+                        "scrubber daemon [Brick: %s]", child->brick_path);
+                goto error_return;
         }
 
-        return ret;
+        /**
+         * Everything has been setup.. add this subvolume to scrubbers
+         * list.
+         */
+        pthread_mutex_lock (&fsscrub->mutex);
+        {
+                list_move (&child->list, &fsscrub->scrublist);
+                pthread_cond_broadcast (&fsscrub->cond);
+        }
+        pthread_mutex_unlock (&fsscrub->mutex);
+
+        return 0;
+
+ error_return:
+        return -1;
 }
 
 /**
@@ -1202,8 +1225,7 @@ br_handle_events (void *arg)
                                 "failed to connect to the "
                                 "child (subvolume: %s)",
                                 child->xl->name);
-                        else
-                                list_del_init (&child->list);
+
                 }
         }
 
@@ -1379,16 +1401,72 @@
-int32_t
-br_init_rate_limiter (br_private_t *priv)
+/**
+ * For signer, only rate limit CPU usage (during hash calculation) when
+ * compiled with -DBR_RATE_LIMIT_SIGNER cflags, else let it run full
+ * throttle.
+ */
+static int32_t
+br_rate_limit_signer (xlator_t *this, int child_count, int numbricks)
 {
-        br_tbf_opspec_t *spec = opthrottle;
-        priv->tbf = br_tbf_init (spec, sizeof (opthrottle)
-                                       / sizeof (br_tbf_opspec_t));
+        br_private_t *priv = NULL;
+        br_tbf_opspec_t spec = {0,};
+
+        priv = this->private;
+
+        spec.op = BR_TBF_OP_HASH;
+        spec.rate = 0;
+        spec.maxlimit = 0;
+
+#ifdef BR_RATE_LIMIT_SIGNER
+
+        double contribution = 0;
+        contribution = ((double)1 - ((double)child_count / (double)numbricks));
+        if (contribution == 0)
+                contribution = 1;
+        spec.rate = BR_HASH_CALC_READ_SIZE * contribution;
+        spec.maxlimit = BR_WORKERS * BR_HASH_CALC_READ_SIZE;
+
+#endif
+
+        if (!spec.rate)
+                gf_log (this->name,
+                        GF_LOG_INFO, "[Rate Limit Info] \"FULL THROTTLE\"");
+        else
+                gf_log (this->name, GF_LOG_INFO,
+                        "[Rate Limit Info] \"tokens/sec (rate): %lu, "
+                        "maxlimit: %lu\"", spec.rate, spec.maxlimit);
 
+        priv->tbf = br_tbf_init (&spec, 1);
         return priv->tbf ? 0 : -1;
 }
 
+static int32_t
+br_signer_init (xlator_t *this, br_private_t *priv)
+{
+        int32_t ret = 0;
+        int numbricks = 0;
+
+        GF_OPTION_INIT ("expiry-time", priv->expiry_time, int32, error_return);
+        GF_OPTION_INIT ("brick-count", numbricks, int32, error_return);
+
+        ret = br_rate_limit_signer (this, priv->child_count, numbricks);
+        if (ret)
+                goto error_return;
+
+        ret = br_init_signer (this, priv);
+        if (ret)
+                goto cleanup_tbf;
+
+        return 0;
+
+ cleanup_tbf:
+        /* cleanup TBF */
+ error_return:
+        return -1;
+
+}
+
 int32_t
 init (xlator_t *this)
 {
@@ -1410,7 +1488,6 @@ init (xlator_t *this)
         }
 
         GF_OPTION_INIT ("scrubber", priv->iamscrubber, bool, out);
-        GF_OPTION_INIT ("expiry-time", priv->expiry_time, int32, out);
 
         priv->child_count = xlator_subvolume_count (this);
         priv->children = GF_CALLOC (priv->child_count, sizeof (*priv->children),
@@ -1443,18 +1520,19 @@ init (xlator_t *this)
                 INIT_LIST_HEAD (&priv->children[i].list);
         INIT_LIST_HEAD (&priv->bricks);
 
-        ret = br_init_rate_limiter (priv);
-        if (ret)
-                goto cleanup_mutex;
-
         this->private = priv;
 
         if (!priv->iamscrubber) {
-                ret = br_init_signer (this, priv);
-                if (ret)
-                        goto cleanup_tbf;
+                ret = br_signer_init (this, priv);
+        } else {
+                ret = br_scrubber_init (this, priv);
+                if (!ret)
+                        ret = br_scrubber_handle_options (this, priv, NULL);
         }
 
+        if (ret)
+                goto cleanup_mutex;
+
         ret = gf_thread_create (&priv->thread, NULL, br_handle_events, this);
         if (ret != 0) {
                 gf_log (this->name, GF_LOG_ERROR,
@@ -1469,7 +1547,6 @@ init (xlator_t *this)
 
         return 0;
 
- cleanup_tbf:
  cleanup_mutex:
         (void) pthread_cond_destroy (&priv->cond);
         (void) pthread_mutex_destroy (&priv->lock);
@@ -1505,6 +1582,17 @@ fini (xlator_t *this)
         return;
 }
 
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        br_private_t *priv = this->private;
+
+        if (!priv->iamscrubber)
+                return 0;
+
+        return br_scrubber_handle_options (this, priv, options);
+}
+
 struct xlator_fops fops;
 struct xlator_cbks cbks;
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index 5b641801916..6f21a6985ba 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -38,8 +38,26 @@
  */
 #define BR_WORKERS 4
 
+typedef enum scrub_throttle {
+        BR_SCRUB_THROTTLE_VOID       = -1,
+        BR_SCRUB_THROTTLE_LAZY       = 0,
+        BR_SCRUB_THROTTLE_NORMAL     = 1,
+        BR_SCRUB_THROTTLE_AGGRESSIVE = 2,
+} scrub_throttle_t;
+
 #define signature_size(hl) (sizeof (br_isignature_t) + hl + 1)
 
+struct br_scanfs {
+        gf_lock_t entrylock;
+
+        pthread_mutex_t waitlock;
+        pthread_cond_t waitcond;
+
+        unsigned int entries;
+        struct list_head queued;
+        struct list_head ready;
+};
+
 struct br_child {
         char child_up;                /* Indicates whether this child is
                                          up or not */
@@ -53,12 +71,14 @@ struct br_child {
         xlator_t *this;               /* Bit rot xlator */
 
         pthread_t thread;             /* initial crawler for unsigned
-                                         object(s) */
+                                         object(s) or scrub crawler */
         int threadrunning;            /* active thread */
 
         struct mem_pool *timer_pool;  /* timer-wheel's timer mem-pool */
 
         struct timeval tv;
+
+        struct br_scanfs fsscan;      /* per subvolume FS scanner */
 };
 
 typedef struct br_child br_child_t;
@@ -72,6 +92,23 @@ struct br_obj_n_workers {
                                          signing each object */
 };
 
+struct br_scrubber {
+        xlator_t *this;
+
+        scrub_throttle_t throttle;
+
+        pthread_mutex_t mutex;
+        pthread_cond_t cond;
+
+        unsigned int nr_scrubbers;
+        struct list_head scrubbers;
+
+        /*
+         * list of "rotatable" subvolume(s) undergoing scrubbing
+         */
+        struct list_head scrublist;
+};
+
 typedef struct br_obj_n_workers br_obj_n_workers_t;
 
 struct br_private {
@@ -100,6 +137,7 @@ struct br_private {
         br_tbf_t *tbf;                /* token bucket filter */
 
         gf_boolean_t iamscrubber;     /* function as a fs scrubber */
+        struct br_scrubber fsscrub;   /* scrubbers for this subvolume */
 };
 
 typedef struct br_private br_private_t;
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index bb4030493db..46271407219 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -25,6 +25,9 @@ enum br_mem_types {
         gf_br_mt_br_tbf_t,
         gf_br_mt_br_tbf_bucket_t,
         gf_br_mt_br_tbf_throttle_t,
+        gf_br_mt_br_tbf_opspec_t,
+        gf_br_mt_br_scrubber_t,
+        gf_br_mt_br_fsscan_entry_t,
         gf_br_stub_mt_end
 };