| author | Richard Wareing <rwareing@fb.com> | 2015-10-11 02:02:28 -0700 |
|---|---|---|
| committer | Shreyas Siravara <sshreyas@fb.com> | 2016-12-09 09:06:01 -0800 |
| commit | e52d41938468c2fffd9372a0e47ae3b61c8aa965 (patch) | |
| tree | 616e591520ab8cf497a2f7a5527a3050e947320e | |
| parent | 46254326030296c121c8d3d01dc511f792ea8d1d (diff) | |
performance/io-threads: Eliminate spinlock contention via fops-per-thread-ratio
Summary:
- Background: Spinlock contention is frequently observed on busy GFS
clusters; it wastes CPU and degrades the performance of the cluster.
Current workarounds involve under-provisioning the thread pool, but this
is problematic because during busy periods there may not be enough
threads to service the queue.
- This patch introduces a technique to avoid the stampeding-herd problem
among the io-threads workers. Instead of signalling the condition variable
and scaling the thread pool on every enqueue, a worker is only woken (and
the pool only grown) when the queue depth per active thread exceeds a
tunable FOP-to-thread ratio, keeping already-running threads sufficiently
busy (a sketch of the gating condition follows this summary). The ratio is
controlled by the performance.io-thread-fops-per-thread-ratio option.
- More detailed reading on this approach can be found here:
https://h21007.www2.hp.com/portal/download/files/unprot/hpux/MakingConditionVariablesPerform.pdf
- Cherry-pick of D2530504 for 3.8
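
The gating condition at the heart of the patch can be read in isolation.
Below is a minimal, self-contained sketch of the check performed in
do_iot_schedule(); the struct and function names here are illustrative, and
only the fields and the comparison mirror the diff further down.

```c
/* Sketch of the fops-per-thread-ratio gate (illustrative names; the real
 * check lives in do_iot_schedule() in io-threads.c). */
#include <stdio.h>

struct iot_conf_sketch {
        int max_count;              /* configured thread ceiling           */
        int curr_count;             /* threads currently alive             */
        int sleep_count;            /* threads idle in pthread_cond_wait() */
        int queue_size;             /* FOPs waiting in the request queues  */
        int fops_per_thread_ratio;  /* 0 disables the gate entirely        */
};

/* Return 1 when a worker should be signalled (and the pool possibly scaled
 * up); 0 when the already-busy threads are left to drain the queue. */
static int
should_wake_worker (const struct iot_conf_sketch *conf)
{
        int active_count = conf->curr_count - conf->sleep_count;

        if (conf->fops_per_thread_ratio == 0 || active_count == 0)
                return 1;

        return (conf->queue_size / active_count > conf->fops_per_thread_ratio &&
                active_count < conf->max_count);
}

int
main (void)
{
        /* 16 threads alive, 4 of them busy, 30 queued FOPs, ratio of 20:
         * 30/4 <= 20, so no signal is sent and no new thread is spawned. */
        struct iot_conf_sketch conf = { 64, 16, 12, 30, 20 };
        printf ("wake worker: %d\n", should_wake_worker (&conf));
        return 0;
}
```

Since fops-per-thread-ratio is registered in glusterd_volopt_map, it can be
set at runtime like any other volume option (e.g. via gluster volume set);
per the check above, a value of 0 falls back to the previous behaviour of
signalling on every enqueue, and the default shipped by this patch is 20.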
Test Plan:
- Stress test on my dev server
- Shadow testing
Reviewed By: moox, sshreyas
Signed-off-by: Shreyas Siravara <sshreyas@fb.com>
Change-Id: I771ae783aa4ca5a6fd0449db64e07d1f4bff0d04
Reviewed-on: http://review.gluster.org/16080
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Tested-by: Shreyas Siravara <sshreyas@fb.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Kevin Vigor <kvigor@fb.com>
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 5 |
| -rw-r--r-- | xlators/performance/io-threads/src/io-threads.c | 39 |
| -rw-r--r-- | xlators/performance/io-threads/src/io-threads.h | 5 |
3 files changed, 44 insertions, 5 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 1e24adabe0c..7517672de8c 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1461,6 +1461,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .option      = "thread-count",
           .op_version  = 1
         },
+        { .key         = "performance.io-thread-fops-per-thread-ratio",
+          .voltype     = "performance/io-threads",
+          .option      = "fops-per-thread-ratio",
+          .op_version  = 1
+        },
         { .key         = "performance.high-prio-threads",
           .voltype     = "performance/io-threads",
           .op_version  = 1
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index c81a97d8a39..541079e2070 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -202,7 +202,7 @@ iot_worker (void *data)
                                                       &conf->mutex, &sleep);
                                 pthread_mutex_unlock(&conf->mutex);
                                 continue;
-                        }
+                        }
                 }

                 pthread_mutex_unlock (&conf->mutex);
@@ -228,14 +228,25 @@ int
 do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri)
 {
         int   ret = 0;
+        int   active_count = 0;

         pthread_mutex_lock (&conf->mutex);
         {
                 __iot_enqueue (conf, stub, pri);

-                pthread_cond_signal (&conf->cond);
-
-                ret = __iot_workers_scale (conf);
+                /* If we have an ample supply of threads alive already
+                 * it's massively more efficient to keep the ones you have
+                 * busy vs making new ones and signaling everyone
+                 */
+                active_count = conf->curr_count - conf->sleep_count;
+                if (conf->fops_per_thread_ratio == 0 || active_count == 0 ||
+                    (conf->queue_size/active_count >
+                     conf->fops_per_thread_ratio &&
+                     active_count < conf->max_count)) {
+                        pthread_cond_signal (&conf->cond);
+
+                        ret = __iot_workers_scale (conf);
+                }
         }
         pthread_mutex_unlock (&conf->mutex);

@@ -900,6 +911,9 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32,
                           out);

+        GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
+                          options, int32, out);
+
         GF_OPTION_RECONF ("high-prio-threads",
                           conf->ac_iot_limit[IOT_PRI_HI], options, int32, out);
@@ -972,6 +986,9 @@ init (xlator_t *this)
         GF_OPTION_INIT ("thread-count", conf->max_count, int32, out);

+        GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
+                        int32, out);
+
         GF_OPTION_INIT ("high-prio-threads", conf->ac_iot_limit[IOT_PRI_HI],
                         int32, out);
@@ -1096,6 +1113,20 @@ struct volume_options options[] = {
                          "perform concurrent IO operations"
         },
+        { .key  = {"fops-per-thread-ratio"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = IOT_MIN_FOP_PER_THREAD,
+          .max  = IOT_MAX_FOP_PER_THREAD,
+          .default_value = "20",
+          .description = "The optimal ratio of threads to FOPs in the queue "
+                         "we wish to achieve before creating a new thread. "
+                         "The idea here is it's far cheaper to keep our "
+                         "currently running threads busy than spin up "
+                         "new threads or cause a stampeding herd of threads "
+                         "to service a singular FOP when you have a thread "
+                         "which will momentarily become available to do the "
+                         "work."
+        },
         { .key  = {"high-prio-threads"},
           .type = GF_OPTION_TYPE_INT,
           .min  = IOT_MIN_THREADS,
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index d8eea2cf77a..e5c97f690a2 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -34,7 +34,9 @@ struct iot_conf;

 #define IOT_MIN_THREADS         1
 #define IOT_DEFAULT_THREADS     16
-#define IOT_MAX_THREADS         64
+#define IOT_MAX_THREADS         256
+#define IOT_MIN_FOP_PER_THREAD  0
+#define IOT_MAX_FOP_PER_THREAD  2000

 #define IOT_THREAD_STACK_SIZE   ((size_t)(1024*1024))
@@ -62,6 +64,7 @@ struct iot_conf {
         pthread_cond_t       cond;

         int32_t              max_count;   /* configured maximum */
+        int32_t              fops_per_thread_ratio;
         int32_t              curr_count;  /* actual number of threads running */
         int32_t              sleep_count;
