summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVenky Shankar <vshankar@redhat.com>2015-02-03 19:05:28 +0530
committerVijay Bellur <vbellur@redhat.com>2015-03-18 22:05:51 -0700
commit5394f3cf60b0815d2919d24e9945ba47e3bb1f9b (patch)
tree8c3aedc4c5b3777c693c6ffbdcf0aa1150580669
parent2bf23b5b9be9ad600d71474a8f94b62765344b7c (diff)
contrib/timer-wheel: import linux kernel timer-wheel
This patch imports timer-wheel[1] algorithm from the linux kernel (~/kernel/time/timer.c) with some modifications. Timer-wheel is an efficent way to track millions of timers for expiry. This is a variant of the simple but RAM heavy approach of having a list (timer bucket) for every future second. Timer-wheel categorizes every future second into a logarithmic array of arrays. This is done by splitting the 32 bit "timeout" value into fixed "sliced" bits, thereby each category has a fixed size array to which buckets are assigned. A classic split would be 8+6+6+6 (used in this patch) which results in 256+64+64+64 == 512 buckets. Therefore, the entire 32 bit futuristic timeouts have been mapped into 512 buckets. [ NOTE: There are other possible splits, such as "8+8+8+8", but this patch sticks to the widely used and tested default. ] Therfore, the first category "holds" timers whose expiry range is between 1..256, the next cateogry holds 257..16384, third category 16385..1048576 and so on. When timers are added, unless it's in the first category, timers with different timeouts could end up in the same bucket. This means that the timers are "partially sorted" -- sorted in their highest bits. The expiry code walks the first array of buckets and exprires any pending timers (1..256). Next, at time value 257, timers in the first bucket of the second array is "cascaded" onto the first category and timers are placed into respective buckets according to the thier timeout values. Cascading "brings down" the timers timeout to the coorect bucket of their respective category. Therefore, timers are sorted by their highest bits of the timeout value and then by the lower bits too. [1] https://lwn.net/Articles/152436/ Change-Id: I1219abf69290961ae9a3d483e11c107c5f49c4e3 BUG: 1170075 Signed-off-by: Venky Shankar <vshankar@redhat.com> Reviewed-on: http://review.gluster.org/9707 Reviewed-by: Vijay Bellur <vbellur@redhat.com> Tested-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--contrib/timer-wheel/find_last_bit.c82
-rw-r--r--contrib/timer-wheel/timer-wheel.c264
-rw-r--r--contrib/timer-wheel/timer-wheel.h71
-rw-r--r--libglusterfs/src/Makefile.am6
-rw-r--r--libglusterfs/src/list.h23
5 files changed, 444 insertions, 2 deletions
diff --git a/contrib/timer-wheel/find_last_bit.c b/contrib/timer-wheel/find_last_bit.c
new file mode 100644
index 00000000000..054e90a076f
--- /dev/null
+++ b/contrib/timer-wheel/find_last_bit.c
@@ -0,0 +1,82 @@
+/*
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+/**
+ * @find_last_bit
+ * optimized implementation of find last bit in
+ */
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG 64
+#endif
+
+static inline int fls(int x)
+{
+ int r = 32;
+
+ if (!x)
+ return 0;
+ if (!(x & 0xffff0000u)) {
+ x <<= 16;
+ r -= 16;
+ }
+ if (!(x & 0xff000000u)) {
+ x <<= 8;
+ r -= 8;
+ }
+ if (!(x & 0xf0000000u)) {
+ x <<= 4;
+ r -= 4;
+ }
+ if (!(x & 0xc0000000u)) {
+ x <<= 2;
+ r -= 2;
+ }
+ if (!(x & 0x80000000u)) {
+ x <<= 1;
+ r -= 1;
+ }
+ return r;
+}
+
+
+unsigned long gf_tw_find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long words;
+ unsigned long tmp;
+
+ /* Start at final word. */
+ words = size / BITS_PER_LONG;
+
+ /* Partial final word? */
+ if (size & (BITS_PER_LONG-1)) {
+ tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
+ - (size & (BITS_PER_LONG-1)))));
+ if (tmp)
+ goto found;
+ }
+
+ while (words) {
+ tmp = addr[--words];
+ if (tmp) {
+found:
+ return words * BITS_PER_LONG + fls(tmp);
+ }
+ }
+
+ /* Not found */
+ return size;
+}
diff --git a/contrib/timer-wheel/timer-wheel.c b/contrib/timer-wheel/timer-wheel.c
new file mode 100644
index 00000000000..e80d83992bf
--- /dev/null
+++ b/contrib/timer-wheel/timer-wheel.c
@@ -0,0 +1,264 @@
+/*
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include "timer-wheel.h"
+
+static inline void
+__gf_tw_add_timer (struct tvec_base *base, struct gf_tw_timer_list *timer)
+{
+ int i;
+ unsigned long idx;
+ unsigned long expires;
+ struct list_head *vec;
+
+ expires = timer->expires;
+
+ idx = expires - base->timer_sec;
+
+ if (idx < TVR_SIZE) {
+ i = expires & TVR_MASK;
+ vec = base->tv1.vec + i;
+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+ i = (expires >> TVR_BITS) & TVN_MASK;
+ vec = base->tv2.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 2*TVN_BITS)) {
+ i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+ vec = base->tv3.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 3*TVN_BITS)) {
+ i = (expires >> (TVR_BITS + 2*TVN_BITS)) & TVN_MASK;
+ vec = base->tv4.vec + i;
+ } else if (idx < 0) {
+ vec = base->tv1.vec + (base->timer_sec & TVR_MASK);
+ } else {
+ i = (expires >> (TVR_BITS + 3*TVN_BITS)) & TVN_MASK;
+ vec = base->tv5.vec + i;
+ }
+
+ list_add_tail (&timer->entry, vec);
+}
+
+/* optimized find_last_bit() */
+unsigned long gf_tw_find_last_bit(const unsigned long *, unsigned long);
+
+static inline unsigned long
+apply_slack(struct tvec_base *base, struct gf_tw_timer_list *timer)
+{
+ long delta;
+ unsigned long mask, expires, expires_limit;
+
+ expires = timer->expires;
+
+ delta = expires - base->timer_sec;
+ if (delta < 256)
+ return expires;
+
+ expires_limit = expires + delta / 256;
+ mask = expires ^ expires_limit;
+ if (mask == 0)
+ return expires;
+
+ int bit = gf_tw_find_last_bit (&mask, BITS_PER_LONG);
+ mask = (1UL << bit) - 1;
+
+ expires_limit = expires_limit & ~(mask);
+ return expires_limit;
+}
+
+static inline void
+gf_tw_detach_timer (struct gf_tw_timer_list *timer)
+{
+ struct list_head *entry = &timer->entry;
+
+ list_del (entry);
+}
+
+static inline int
+cascade (struct tvec_base *base, struct tvec *tv, int index)
+{
+ struct gf_tw_timer_list *timer, *tmp;
+ struct list_head tv_list;
+
+ list_replace_init (tv->vec + index, &tv_list);
+
+ list_for_each_entry_safe (timer, tmp, &tv_list, entry) {
+ __gf_tw_add_timer (base, timer);
+ }
+
+ return index;
+}
+
+#define INDEX(N) ((base->timer_sec >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK)
+
+/**
+ * run expired timers
+ */
+static inline void
+run_timers (struct tvec_base *base)
+{
+ unsigned long index, call_time;
+ struct gf_tw_timer_list *timer;
+
+ struct list_head work_list;
+ struct list_head *head = &work_list;
+
+ pthread_spin_lock (&base->lock);
+ {
+ index = base->timer_sec & TVR_MASK;
+
+ if (!index &&
+ (!cascade (base, &base->tv2, INDEX(0))) &&
+ (!cascade (base, &base->tv3, INDEX(1))) &&
+ (!cascade (base, &base->tv4, INDEX(2))))
+ cascade (base, &base->tv5, INDEX(3));
+
+ call_time = base->timer_sec++;
+ list_replace_init (base->tv1.vec + index, head);
+ while (!list_empty(head)) {
+ void (*fn)(struct gf_tw_timer_list *, void *, unsigned long);
+ void *data;
+
+ timer = list_first_entry (head, struct gf_tw_timer_list, entry);
+ fn = timer->function;
+ data = timer->data;
+
+ gf_tw_detach_timer (timer);
+ fn (timer, data, call_time);
+ }
+ }
+ pthread_spin_unlock (&base->lock);
+
+}
+
+void *runner (void *arg)
+{
+ struct timeval tv = {0,};
+ struct tvec_base *base = arg;
+
+ while (1) {
+ run_timers (base);
+
+ tv.tv_sec = 1;
+ tv.tv_usec = 0;
+ (void) select (0, NULL, NULL, NULL, &tv);
+ }
+
+ return NULL;
+
+}
+
+/* interface */
+
+/**
+ * Add a timer in the timer wheel
+ */
+void gf_tw_add_timer (struct tvec_base *base, struct gf_tw_timer_list *timer)
+{
+ pthread_spin_lock (&base->lock);
+ {
+ timer->expires += base->timer_sec;
+ timer->expires = apply_slack (base, timer);
+ __gf_tw_add_timer (base, timer);
+ }
+ pthread_spin_unlock (&base->lock);
+}
+
+/**
+ * Remove a timer from the timer wheel
+ */
+void gf_tw_del_timer (struct gf_tw_timer_list *timer)
+{
+ gf_tw_detach_timer (timer);
+}
+
+int gf_tw_cleanup_timers (struct tvec_base *base)
+{
+ int ret = 0;
+ void *res = NULL;
+
+ /* terminate runner */
+ ret = pthread_cancel (base->runner);
+ if (ret != 0)
+ goto error_return;
+ ret = pthread_join (base->runner, &res);
+ if (ret != 0)
+ goto error_return;
+ if (res != PTHREAD_CANCELED)
+ goto error_return;
+
+ /* destroy lock */
+ ret = pthread_spin_destroy (&base->lock);
+ if (ret != 0)
+ goto error_return;
+
+ /* deallocated timer base */
+ free (base);
+ return 0;
+
+ error_return:
+ return -1;
+}
+
+/**
+ * Initialize various timer wheel lists and spawn a thread that
+ * invokes run_timers()
+ */
+struct tvec_base *gf_tw_init_timers ()
+{
+ int j = 0;
+ int ret = 0;
+ struct timeval tv = {0,};
+ struct tvec_base *base = NULL;
+
+ base = malloc (sizeof (*base));
+ if (!base)
+ goto error_return;
+
+ ret = pthread_spin_init (&base->lock, 0);
+ if (ret != 0)
+ goto error_dealloc;
+
+ for (j = 0; j < TVN_SIZE; j++) {
+ INIT_LIST_HEAD (base->tv5.vec + j);
+ INIT_LIST_HEAD (base->tv4.vec + j);
+ INIT_LIST_HEAD (base->tv3.vec + j);
+ INIT_LIST_HEAD (base->tv2.vec + j);
+ }
+
+ for (j = 0; j < TVR_SIZE; j++) {
+ INIT_LIST_HEAD (base->tv1.vec + j);
+ }
+
+ ret = gettimeofday (&tv, 0);
+ if (ret < 0)
+ goto destroy_lock;
+ base->timer_sec = tv.tv_sec;
+
+ ret = pthread_create (&base->runner, NULL, runner, base);
+ if (ret != 0)
+ goto destroy_lock;
+ return base;
+
+ destroy_lock:
+ (void) pthread_spin_destroy (&base->lock);
+ error_dealloc:
+ free (base);
+ error_return:
+ return NULL;
+}
diff --git a/contrib/timer-wheel/timer-wheel.h b/contrib/timer-wheel/timer-wheel.h
new file mode 100644
index 00000000000..74b8dfdff5e
--- /dev/null
+++ b/contrib/timer-wheel/timer-wheel.h
@@ -0,0 +1,71 @@
+/*
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+#ifndef __TIMER_WHEEL_H
+#define __TIMER_WHEEL_H
+
+#include <pthread.h>
+
+#include "list.h"
+
+#define TVR_BITS 8
+#define TVN_BITS 6
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_MASK (TVR_SIZE - 1)
+#define TVN_MASK (TVN_SIZE - 1)
+
+#define BITS_PER_LONG 64
+
+struct tvec {
+ struct list_head vec[TVN_SIZE];
+};
+
+struct tvec_root {
+ struct list_head vec[TVR_SIZE];
+};
+
+struct tvec_base {
+ pthread_spinlock_t lock; /* base lock */
+
+ pthread_t runner; /* run_timer() */
+
+ unsigned long timer_sec; /* time counter */
+
+ struct tvec_root tv1;
+ struct tvec tv2;
+ struct tvec tv3;
+ struct tvec tv4;
+ struct tvec tv5;
+};
+
+struct gf_tw_timer_list {
+ void *data;
+ unsigned long expires;
+
+ /** callback routine */
+ void (*function)(struct gf_tw_timer_list *, void *, unsigned long);
+
+ struct list_head entry;
+};
+
+/** The API! */
+struct tvec_base *gf_tw_init_timers ();
+int gf_tw_cleanup_timers (struct tvec_base *);
+void gf_tw_add_timer (struct tvec_base *, struct gf_tw_timer_list *);
+void gf_tw_del_timer (struct gf_tw_timer_list *);
+
+#endif
diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am
index 2441023ad39..818de91cf36 100644
--- a/libglusterfs/src/Makefile.am
+++ b/libglusterfs/src/Makefile.am
@@ -30,7 +30,8 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \
$(CONTRIBDIR)/libgen/basename_r.c $(CONTRIBDIR)/libgen/dirname_r.c \
$(CONTRIBDIR)/stdlib/gf_mkostemp.c strfd.c parse-utils.c \
$(CONTRIBDIR)/mount/mntent.c $(CONTRIBDIR)/libexecinfo/execinfo.c\
- quota-common-utils.c rot-buffs.c
+ quota-common-utils.c rot-buffs.c $(CONTRIBDIR)/timer-wheel/timer-wheel.c \
+ $(CONTRIBDIR)/timer-wheel/find_last_bit.c
nodist_libglusterfs_la_SOURCES = y.tab.c graph.lex.c
@@ -49,7 +50,8 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h timespec.
template-component-messages.h strfd.h syncop-utils.h parse-utils.h \
$(CONTRIBDIR)/mount/mntent_compat.h lvm-defaults.h \
$(CONTRIBDIR)/libexecinfo/execinfo_compat.h \
- unittest/unittest.h quota-common-utils.h rot-buffs.h
+ unittest/unittest.h quota-common-utils.h rot-buffs.h \
+ $(CONTRIBDIR)/timer-wheel/timer-wheel.h
EXTRA_DIST = graph.l graph.y
diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h
index 894fa3012cf..a860275a91e 100644
--- a/libglusterfs/src/list.h
+++ b/libglusterfs/src/list.h
@@ -191,6 +191,29 @@ list_is_singular(struct list_head *head)
return !list_empty(head) && (head->next == head->prev);
}
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * If @old was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+ struct list_head *new)
+{
+ new->next = old->next;
+ new->next->prev = new;
+ new->prev = old->prev;
+ new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+ struct list_head *new)
+{
+ list_replace(old, new);
+ INIT_LIST_HEAD(old);
+}
+
#define list_entry(ptr, type, member) \
((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))