cluster/dht: Fix unsafe iteration on inode->fd_list

When DHT traverses the inode->fd_list, it does that in an unsafe way that can generate races with fd_unref() called from other threads. This patch fixes this problem taking the inode->lock and adding a reference to the fd while it's being used outside of the mutex protected region. A minor change in storage/posix has been done to also access the inode->fd_list in a safe way. Change-Id: I10d469ca6a8f76e950a8c9779ae9c8b70f88ef93 BUG: 1344340 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> Reviewed-on: http://review.gluster.org/14682 CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
author: Xavier Hernandez <xhernandez@datalab.es> 2016-06-09 16:53:19 +0200
committer: Raghavendra G <rgowdapp@redhat.com> 2016-06-15 02:04:29 -0700
commit: 4c08d36e7c6f189499f2340eb529b7f4ceff57f6 (patch)
tree: 825d74e65a7bfad50bace21df80cfb2545083814 /xlators/cluster
parent: e740b700975bdb1014ede99c65731095bdf81a72 (diff)
1 files changed, 70 insertions, 16 deletions
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 0a5e1128db1..590d0043507 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -1103,6 +1103,7 @@ dht_migration_complete_check_task (void *data)
         dht_conf_t         *conf        = NULL;
         inode_t            *inode       = NULL;
         fd_t               *iter_fd     = NULL;
+        fd_t               *tmp         = NULL;
         uint64_t            tmp_miginfo = 0;
         dht_migrate_info_t *miginfo     = NULL;
         int                 open_failed = 0;
@@ -1228,27 +1229,46 @@ dht_migration_complete_check_task (void *data)
                 goto out;
         }
 
+        /* perform 'open()' on all the fd's present on the inode */
+        if (tmp_loc.path == NULL) {
+                inode_path (inode, NULL, &path);
+                if (path)
+                        tmp_loc.path = path;
+        }
+
+        LOCK(&inode->lock);
+
         if (list_empty (&inode->fd_list))
-                goto out;
+                goto unlock;
 
         /* perform open as root:root. There is window between linkfile
          * creation(root:root) and setattr with the correct uid/gid
          */
         SYNCTASK_SETID(0, 0);
 
-        /* perform 'open()' on all the fd's present on the inode */
-        tmp_loc.inode = inode;
-        inode_path (inode, NULL, &path);
-        if (path)
-                tmp_loc.path = path;
-
-        list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+        /* It's possible that we are the last user of iter_fd after each
+         * iteration. In this case the fd_unref() of iter_fd at the end of
+         * the loop will cause the destruction of the fd. So we need to
+         * iterate the list safely because iter_fd cannot be trusted.
+         */
+        list_for_each_entry_safe (iter_fd, tmp, &inode->fd_list, inode_list) {
 
                 if (fd_is_anonymous (iter_fd))
                         continue;
+
                 if (dht_fd_open_on_dst (this, iter_fd, dst_node))
                         continue;
 
+                /* We need to release the inode->lock before calling
+                 * syncop_open() to avoid possible deadlocks. However this
+                 * can cause the iter_fd to be released by other threads.
+                 * To avoid this, we take a reference before releasing the
+                 * lock.
+                 */
+                __fd_ref(iter_fd);
+
+                UNLOCK(&inode->lock);
+
                 /* flags for open are stripped down to allow following the
                  * new location of the file, otherwise we can get EEXIST or
                  * truncate the file again as rebalance is moving the data */
@@ -1270,15 +1290,23 @@ dht_migration_complete_check_task (void *data)
                 } else {
                         dht_fd_ctx_set (this, iter_fd, dst_node);
                 }
+
+                fd_unref(iter_fd);
+
+                LOCK(&inode->lock);
         }
 
         SYNCTASK_SETID (frame->root->uid, frame->root->gid);
 
         if (open_failed) {
                 ret = -1;
-                goto out;
+                goto unlock;
         }
         ret = 0;
+
+unlock:
+        UNLOCK(&inode->lock);
+
 out:
 
         loc_wipe (&tmp_loc);
@@ -1351,6 +1379,7 @@ dht_rebalance_inprogress_task (void *data)
         dht_conf_t   *conf            = NULL;
         inode_t      *inode           = NULL;
         fd_t         *iter_fd         = NULL;
+        fd_t         *tmp             = NULL;
         int           open_failed     = 0;
         uint64_t      tmp_miginfo     = 0;
         dht_migrate_info_t *miginfo   = NULL;
@@ -1461,24 +1490,44 @@ dht_rebalance_inprogress_task (void *data)
         }
         ret = 0;
 
+        if (tmp_loc.path == NULL) {
+                inode_path (inode, NULL, &path);
+                if (path)
+                        tmp_loc.path = path;
+        }
+
+        LOCK(&inode->lock);
+
         if (list_empty (&inode->fd_list))
-                goto done;
+                goto unlock;
 
         /* perform open as root:root. There is window between linkfile
          * creation(root:root) and setattr with the correct uid/gid
          */
         SYNCTASK_SETID (0, 0);
 
-        inode_path (inode, NULL, &path);
-        if (path)
-                tmp_loc.path = path;
-
-        list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+        /* It's possible that we are the last user of iter_fd after each
+         * iteration. In this case the fd_unref() of iter_fd at the end of
+         * the loop will cause the destruction of the fd. So we need to
+         * iterate the list safely because iter_fd cannot be trusted.
+         */
+        list_for_each_entry_safe (iter_fd, tmp, &inode->fd_list, inode_list) {
                 if (fd_is_anonymous (iter_fd))
                         continue;
 
                 if (dht_fd_open_on_dst (this, iter_fd, dst_node))
                         continue;
+
+                /* We need to release the inode->lock before calling
+                 * syncop_open() to avoid possible deadlocks. However this
+                 * can cause the iter_fd to be released by other threads.
+                 * To avoid this, we take a reference before releasing the
+                 * lock.
+                 */
+                __fd_ref(iter_fd);
+
+                UNLOCK(&inode->lock);
+
                 /* flags for open are stripped down to allow following the
                  * new location of the file, otherwise we can get EEXIST or
                  * truncate the file again as rebalance is moving the data */
@@ -1501,16 +1550,21 @@ dht_rebalance_inprogress_task (void *data)
                         dht_fd_ctx_set (this, iter_fd, dst_node);
                 }
 
+                fd_unref(iter_fd);
+
+                LOCK(&inode->lock);
         }
 
         SYNCTASK_SETID (frame->root->uid, frame->root->gid);
 
+unlock:
+        UNLOCK(&inode->lock);
+
         if (open_failed) {
                 ret = -1;
                 goto out;
         }
 
-done:
         ret = dht_inode_ctx_set_mig_info (this, inode, src_node, dst_node);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
author	Xavier Hernandez <xhernandez@datalab.es>	2016-06-09 16:53:19 +0200
committer	Raghavendra G <rgowdapp@redhat.com>	2016-06-15 02:04:29 -0700
commit	4c08d36e7c6f189499f2340eb529b7f4ceff57f6 (patch)
tree	825d74e65a7bfad50bace21df80cfb2545083814 /xlators/cluster
parent	e740b700975bdb1014ede99c65731095bdf81a72 (diff)